def forward(self, inputs):
        self.retain_inputs((0, 1))
        x, gamma, beta = inputs
        x_layout, _, _ = self.input_layouts
        self.output_layouts = (x_layout,)

        self.axis = _compute_axis(x.ndim, gamma.ndim, self.axis)
        self.key_axis = _compute_key_axis(x.ndim, gamma.ndim, self.axis)

        x_shape = memory_layouts._transpose_shape(x.shape, x_layout, None)

        if all(x_shape[i] == 1 for i in self.axis):
            if 0 in self.axis:
                warnings.warn(
                    'A batch with no more than one sample has been given'
                    ' to F.batch_normalization. F.batch_normalization'
                    ' will always output a zero tensor for such batches.'
                    ' This could be caused by incorrect configuration in'
                    ' your code (such as running evaluation while'
                    ' chainer.config.train=True),'
                    ' but could also happen in the last batch of training'
                    ' if a non-repeating iterator is used.',
                    UserWarning)
            else:
                warnings.warn(
                    'F.batch_normalization received a batch with singleton'
                    ' dimensions along all axes that are used for aggregating'
                    ' statistics. F.batch_normalization'
                    ' will always output a zero tensor for such batches.',
                    UserWarning)

        # TODO(niboshi): Refactor calculation of expander and axis into a
        # function and call it just before they are used.

        # expander inserts singleton dimensions into gamma and beta so that
        # they can be broadcast against x.
        expander = [None for _ in range(x.ndim)]
        for i in self.key_axis:
            expander[i] = slice(None)
        expander = tuple(expander)
        expander = memory_layouts._transpose_shape(expander, None, x_layout)
        self.expander = expander

        xp = backend.get_array_module(x)

        self._impl = self._impl_selector(self, inputs)

        (
            y, y_layout, self.running_mean, self.running_var,
            self.mean, self.var, self.inv_std,
            self.forward_data) = (
                self._impl.forward(
                    axis=self.axis, gamma=gamma, x=x, x_layout=x_layout,
                    xp=xp, expander=expander, beta=beta, eps=self.eps,
                    decay=self.decay,
                    running_mean=self.running_mean,
                    running_var=self.running_var))

        self.output_layouts = (y_layout,)
        return y,
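
# --- Illustration (not part of the original code) ---------------------------
# A minimal standalone sketch of the ``expander`` trick used above: slice(None)
# is kept on the axes that gamma/beta actually cover (``key_axis``) and None
# (newaxis) is inserted everywhere else, so the 1-D parameters broadcast over
# x.  The shapes and the choice of key_axis below are illustrative assumptions
# for a plain NCHW input; ``expander`` here is rebuilt by hand, not taken from
# the class.
import numpy as np

x = np.random.rand(8, 3, 4, 4).astype(np.float32)    # (N, C, H, W)
gamma = np.ones(3, dtype=np.float32)                  # one scale per channel
key_axis = (1,)                                       # channel axis of x

expander = tuple(
    slice(None) if i in key_axis else None for i in range(x.ndim))
assert gamma[expander].shape == (1, 3, 1, 1)
assert (x * gamma[expander]).shape == x.shape         # broadcasts over N, H, W
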
    def _forward_cudnn(self, x, W, b, input_layouts):
        x_layout, w_layout = input_layouts
        self.output_layouts = (x_layout, )
        n = len(x)

        _, c, _, _ = memory_layouts._transpose_shape(W.shape, w_layout, None)
        y_raw_shape = memory_layouts._transpose_shape(
            (n, c * self.groups, self.outh, self.outw), None, x_layout)

        y = cuda.cupy.empty(y_raw_shape, dtype=x.dtype)
        pad = (self.ph, self.pw)
        stride = (self.sy, self.sx)
        dilation = (self.dy, self.dx)
        deterministic = configuration.config.cudnn_deterministic
        auto_tune = configuration.config.autotune
        tensor_core = configuration.config.use_cudnn_tensor_core
        cudnn_x_layout = cuda._get_cudnn_tensor_layout_x(x_layout)
        cudnn_w_layout = cuda._get_cudnn_tensor_layout_w(w_layout)
        cuda.cudnn.convolution_backward_data(W,
                                             x,
                                             b,
                                             y,
                                             pad,
                                             stride,
                                             dilation,
                                             self.groups,
                                             deterministic=deterministic,
                                             auto_tune=auto_tune,
                                             tensor_core=tensor_core,
                                             d_layout=cudnn_x_layout,
                                             w_layout=cudnn_w_layout)

        return y,
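
# --- Illustration (not part of the original code) ---------------------------
# The method above writes its result into an (n, c * groups, self.outh,
# self.outw) buffer and fills it with cuDNN's convolution_backward_data, i.e.
# it runs a deconvolution (transposed convolution).  For cover_all=False and
# no dilation, the output size such a deconvolution produces is the inverse of
# the usual convolution size formula; ``deconv_outsize`` below is an
# illustrative helper, not a library function
# (chainer.utils.conv.get_deconv_outsize computes the same quantity).
def deconv_outsize(size, k, s, p):
    # out = stride * (in - 1) + kernel - 2 * pad
    return s * (size - 1) + k - 2 * p

# e.g. a 4x4 input, 3x3 kernel, stride 2, pad 1 gives a 7x7 output.
assert deconv_outsize(4, k=3, s=2, p=1) == 7
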
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))  # only retain x and W
        if len(inputs) == 2:
            (x, W), b = inputs, None
            x_layout, w_layout = self.input_layouts
        else:
            x, W, b = inputs
            x_layout, w_layout, _ = self.input_layouts

        x_shape = memory_layouts._transpose_shape(x.shape, x_layout, None)
        w_shape = memory_layouts._transpose_shape(W.shape, w_layout, None)
        self._calc_out_size(x_shape, w_shape)
        self._set_cover_all(x_shape, w_shape)

        use_cudnn = (chainer.should_use_cudnn('>=auto') and not self.cover_all
                     and x.dtype == W.dtype
                     and ((self.dy == 1 and self.dx == 1) or
                          (_cudnn_version >= 6000
                           and not configuration.config.cudnn_deterministic))
                     and (self.groups <= 1 or _cudnn_version >= 7000))

        if use_cudnn:
            # cuDNN implementation
            return self._forward_cudnn(x, W, b, (x_layout, w_layout))

        elif self.groups > 1:
            return self._forward_grouped_convolution(x, W, b)

        else:
            return self._forward_gpu_core(x, W, b)
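
# --- Illustration (not part of the original code) ---------------------------
# A hypothetical sketch of what the memory_layouts._transpose_shape calls used
# throughout these methods appear to do: permute a shape tuple between the
# logical NCHW order (layout ``None``) and a channels-last NHWC memory layout.
# ``transpose_shape_nhwc`` is an assumption for illustration only, not the
# library's implementation, and it only covers the 4-D NHWC case.
def transpose_shape_nhwc(shape, src_is_nhwc, dst_is_nhwc):
    n, a, b, c = shape
    if src_is_nhwc and not dst_is_nhwc:
        return (n, c, a, b)       # raw NHWC -> logical NCHW
    if dst_is_nhwc and not src_is_nhwc:
        return (n, b, c, a)       # logical NCHW -> raw NHWC
    return shape

assert transpose_shape_nhwc((2, 5, 7, 3), True, False) == (2, 3, 5, 7)
assert transpose_shape_nhwc((2, 3, 5, 7), False, True) == (2, 5, 7, 3)
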
    def forward_cpu(self, inputs):
        if ((self.dy == 1 and self.dx == 1)
                and intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            self._use_ideep = True

        self.retain_inputs((0, 1))  # only retain x and W
        if len(inputs) == 2:
            (x, W), b = inputs, None
            x_layout, w_layout = self.input_layouts
        else:
            x, W, b = inputs
            x_layout, w_layout, _ = self.input_layouts

        x_shape = memory_layouts._transpose_shape(x.shape, x_layout, None)
        w_shape = memory_layouts._transpose_shape(W.shape, w_layout, None)
        self._calc_out_size(x_shape, w_shape)

        if self.groups > 1:
            # Grouped convolution implementation
            return self._forward_grouped_convolution(x, W, b)

        elif (intel64.should_use_ideep('>=auto')
              and intel64.inputs_all_ready(inputs)):
            # iDeep implementation
            self._use_ideep = True
            return self._forward_ideep(x, W, b)

        else:
            return self._forward_cpu_core(x, W, b)
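
# --- Illustration (not part of the original code) ---------------------------
# A rough sketch of the idea behind the grouped path taken above when
# self.groups > 1: the channel dimension is cut into ``groups`` independent
# slices and each slice is processed with its own block of filters.
# ``grouped_apply`` and ``single_group_op`` are illustrative names, not the
# actual _forward_grouped_convolution implementation.
import numpy as np

def grouped_apply(x, W, groups, single_group_op):
    # x: (N, C, H, W), W: (out_C, C // groups, kH, kW)
    xs = np.split(x, groups, axis=1)          # split input channels
    ws = np.split(W, groups, axis=0)          # split filter blocks
    ys = [single_group_op(xg, wg) for xg, wg in zip(xs, ws)]
    return np.concatenate(ys, axis=1)         # stack group outputs back

# Toy check with an "operation" that just sums each group over its channels.
x = np.zeros((2, 6, 4, 4), dtype=np.float32)
W = np.zeros((8, 3, 3, 3), dtype=np.float32)
y = grouped_apply(x, W, groups=2,
                  single_group_op=lambda xg, wg: xg.sum(axis=1, keepdims=True))
assert y.shape == (2, 2, 4, 4)
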
# Example #5
    def _forward_cudnn(self, x, gy):
        x_layout, gy_layout = self.input_layouts
        w_layout = self.w_layout

        w_raw_shape = memory_layouts._transpose_shape(self.W_shape, None,
                                                      w_layout)

        gW = cuda.cupy.empty(w_raw_shape, dtype=self.W_dtype)
        pad = (self.ph, self.pw)
        stride = (self.sy, self.sx)
        dilation = (self.dy, self.dx)
        deterministic = configuration.config.cudnn_deterministic
        auto_tune = configuration.config.autotune
        tensor_core = configuration.config.use_cudnn_tensor_core
        cudnn_x_layout = cuda._get_cudnn_tensor_layout_x(x_layout)
        cudnn_w_layout = cuda._get_cudnn_tensor_layout_w(w_layout)
        cuda.cudnn.convolution_backward_filter(x,
                                               gy,
                                               gW,
                                               pad,
                                               stride,
                                               dilation,
                                               self.groups,
                                               deterministic=deterministic,
                                               auto_tune=auto_tune,
                                               tensor_core=tensor_core,
                                               d_layout=cudnn_x_layout,
                                               w_layout=cudnn_w_layout)

        return gW,
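
# --- Illustration (not part of the original code) ---------------------------
# A tiny reference sketch of what the filter gradient computed by cuDNN's
# convolution_backward_filter above means, restricted to the simplest case:
# stride 1, no padding, no dilation, a single group.  ``conv2d_grad_w`` is an
# illustrative helper, not the library's code.
import numpy as np

def conv2d_grad_w(x, gy, kh, kw):
    # x: (N, C, H, W), gy: (N, out_C, out_H, out_W)
    _, c, _, _ = x.shape
    _, out_c, out_h, out_w = gy.shape
    gW = np.zeros((out_c, c, kh, kw), dtype=x.dtype)
    for i in range(kh):
        for j in range(kw):
            patch = x[:, :, i:i + out_h, j:j + out_w]  # (N, C, out_H, out_W)
            # gW[o, c, i, j] = sum_{n,y,x} x[n, c, y+i, x+j] * gy[n, o, y, x]
            gW[:, :, i, j] = np.einsum('nchw,nohw->oc', patch, gy)
    return gW

gW = conv2d_grad_w(np.ones((2, 3, 5, 5), 'f'), np.ones((2, 4, 3, 3), 'f'), 3, 3)
assert gW.shape == (4, 3, 3, 3)
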
# Example #6
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))  # retain only x and W
        if len(inputs) == 2:
            (x, W), b = inputs, None
            x_layout, w_layout = self.input_layouts
        else:
            x, W, b = inputs
            x_layout, w_layout, _ = self.input_layouts

        x_shape = memory_layouts._transpose_shape(x.shape, x_layout, None)
        w_shape = memory_layouts._transpose_shape(W.shape, w_layout, None)

        n, _, h, w = x_shape
        out_c, _, kh, kw = w_shape
        out_h, out_w = self._get_out_size(x_shape, w_shape)
        y_raw_shape = memory_layouts._transpose_shape((n, out_c, out_h, out_w),
                                                      None, x_layout)

        y = cuda.cupy.empty(y_raw_shape, dtype=x.dtype)

        use_cudnn = (chainer.should_use_cudnn('>=auto') and not self.cover_all
                     and x.dtype == W.dtype
                     and ((self.dy == 1 and self.dx == 1)
                          or _cudnn_version >= 6000)
                     and (self.groups <= 1 or _cudnn_version >= 7000))

        if self.cudnn_fast and not use_cudnn:
            raise RuntimeError('\'cudnn_fast\' requires cuDNN to work')

        if use_cudnn:
            # cuDNN implementation
            return self._forward_cudnn(x, W, b, y, (x_layout, w_layout))

        elif self.groups > 1:
            return self._forward_grouped_convolution(x, W, b)

        else:
            return self._forward_gpu_core(x, W, b)
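
# --- Illustration (not part of the original code) ---------------------------
# A small usage sketch of the public entry point whose GPU path ends up in a
# forward_gpu like the one above.  Plain NCHW numpy inputs are assumed; the
# expected spatial size follows (in + 2*pad - k) // stride + 1.
import numpy as np
import chainer.functions as F

x = np.random.rand(1, 3, 7, 7).astype(np.float32)    # (N, C, H, W)
W = np.random.rand(8, 3, 3, 3).astype(np.float32)    # (out_C, C, kH, kW)
b = np.zeros(8, dtype=np.float32)

y = F.convolution_2d(x, W, b, stride=2, pad=1)
assert y.shape == (1, 8, 4, 4)    # (7 + 2*1 - 3) // 2 + 1 == 4
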