Example #1
0
    def _gpu_init(self):
        """Build the full-batch loader program and bind its kernel arguments."""
        # Compile-time constants handed to the kernel source.
        macro_defs = {
            "LABELS": int(self.has_labels),
            "SAMPLE_SIZE": self.original_data.sample_size,
            "MAX_MINIBATCH_SIZE": self.max_minibatch_size,
            "original_data_dtype": numpy_dtype_to_opencl(
                self.original_data.dtype),
            "minibatch_data_dtype": numpy_dtype_to_opencl(
                self.minibatch_data.dtype),
        }
        macro_defs.update(self.get_ocl_defines())

        self.build_program(macro_defs, "fullbatch_loader",
                           dtype=self.minibatch_data.dtype)
        self.assign_kernel("fill_minibatch_data_labels")

        # Label buffers are only passed when the dataset actually has labels.
        if self.has_labels:
            args = (self.original_data, self.minibatch_data,
                    self.device.skip(2), self._mapped_original_labels_,
                    self.minibatch_labels, self.shuffled_indices,
                    self.minibatch_indices)
        else:
            args = (self.original_data, self.minibatch_data,
                    self.device.skip(2), self.shuffled_indices,
                    self.minibatch_indices)
        self.set_args(*args)
Example #2
0
 def get_ocl_defines(self):
     """Return the OpenCL macro definitions describing the target buffers."""
     defines = {"TARGET": 1}
     defines["TARGET_SIZE"] = self.original_targets.sample_size
     defines["original_target_dtype"] = numpy_dtype_to_opencl(
         self.original_targets.dtype)
     defines["minibatch_target_dtype"] = numpy_dtype_to_opencl(
         self.minibatch_targets.dtype)
     return defines
Example #3
0
    def _gpu_init(self):
        """Compile the full-batch loader and set the kernel's arguments."""
        defs = {"LABELS": int(self.has_labels)}
        defs["SAMPLE_SIZE"] = self.original_data.sample_size
        defs["MAX_MINIBATCH_SIZE"] = self.max_minibatch_size
        defs["original_data_dtype"] = numpy_dtype_to_opencl(
            self.original_data.dtype)
        defs["minibatch_data_dtype"] = numpy_dtype_to_opencl(
            self.minibatch_data.dtype)
        defs.update(self.get_ocl_defines())

        self.build_program(defs, "fullbatch_loader",
                           dtype=self.minibatch_data.dtype)
        self.assign_kernel("fill_minibatch_data_labels")

        # The label buffers only appear in the signature when labels exist.
        head = (self.original_data, self.minibatch_data, self.device.skip(2))
        tail = (self.shuffled_indices, self.minibatch_indices)
        if self.has_labels:
            self.set_args(*head, self._mapped_original_labels_,
                          self.minibatch_labels, *tail)
        else:
            self.set_args(*head, *tail)
Example #4
0
 def get_ocl_defines(self):
     """Build the OpenCL macro definitions for the target data."""
     return {
         "TARGET": 1,
         "TARGET_SIZE": self.original_targets.sample_size,
         "original_target_dtype": numpy_dtype_to_opencl(
             self.original_targets.dtype),
         "minibatch_target_dtype": numpy_dtype_to_opencl(
             self.minibatch_targets.dtype),
     }
Example #5
0
    def _gpu_init(self):
        """Compile the normalization program and bind its kernel arguments."""
        compute_dtype = self.rdisp.dtype

        macro_defs = {
            "input_type": numpy_dtype_to_opencl(self.input.dtype),
            "mean_type": numpy_dtype_to_opencl(self.mean.dtype),
            "SAMPLE_SIZE": self.mean.size,
        }
        self.build_program(macro_defs, self.__class__.__name__,
                           dtype=compute_dtype)
        self.assign_kernel("normalize_mean_disp")
        self.set_args(self.input, self.mean, self.rdisp, self.output)
Example #6
0
    def _gpu_init(self):
        """Compile the program and attach the normalize_mean_disp kernel."""
        program_dtype = self.rdisp.dtype
        sample_len = self.mean.size

        defines = dict(
            input_type=numpy_dtype_to_opencl(self.input.dtype),
            mean_type=numpy_dtype_to_opencl(self.mean.dtype),
            SAMPLE_SIZE=sample_len)
        self.build_program(defines, self.__class__.__name__,
                           dtype=program_dtype)
        self.assign_kernel("normalize_mean_disp")
        self.set_args(self.input, self.mean, self.rdisp, self.output)
Example #7
0
    def _gpu_init(self):
        """Compile the MSE evaluator, wire its kernels, return the block size."""
        batch = self.err_output.shape[0]
        block_size = min(batch, 128)

        if self.class_targets:
            # Extra kernel source for mapping outputs to the closest target.
            self.sources_["mse_find_closest"] = {
                "target_dtype": numpy_dtype_to_opencl(self.class_targets.dtype)
            }

        cache_name = "%s_%d_%d" % (self.__class__.__name__,
                                   self.output.shape[0],
                                   self.output.sample_size)
        self.build_program(
            cache_file_name=cache_name,
            dtype=self.output.dtype,
            max_batch_size=batch,
            block_size=block_size,
            output_size=self.err_output.sample_size,
            root=self.root,
            normalization=self.normalizer.MAPPING,
            targets_number=(self.class_targets.shape[0]
                            if self.class_targets else None),
            coeffs=self.normalizer.coefficients)

        self.assign_kernel("evaluate_mse")
        self.set_args(self.output, self.target, self.skip_args(2),
                      self.metrics, self.mse.devmem, self.err_output)

        if self.labels and self.class_targets:
            assert self.labels.dtype == self.n_err.dtype == numpy.int32
            self.krn_find_closest_ = self.get_kernel("mse_find_closest")
            self.krn_find_closest_.set_args(
                self.output.devmem, self.class_targets.devmem,
                self.labels.devmem, self.n_err.devmem)

        return block_size
Example #8
0
    def _gpu_init(self):
        """Compile the MSE evaluation program and bind its kernels.

        Returns the block size chosen for kernel launches.
        """
        out_dtype = self.output.dtype
        block_size = min(self.err_output.shape[0], 128)
        if self.class_targets:
            self.sources_["mse_find_closest"] = {
                "target_dtype":
                    numpy_dtype_to_opencl(self.class_targets.dtype)}

        targets_number = (self.class_targets.shape[0]
                          if self.class_targets else None)
        self.build_program(
            cache_file_name="%s_%d_%d" % (self.__class__.__name__,
                                          self.output.shape[0],
                                          self.output.sample_size),
            dtype=out_dtype,
            max_batch_size=self.err_output.shape[0],
            block_size=block_size,
            output_size=self.err_output.sample_size,
            root=self.root,
            normalization=self.normalizer.MAPPING,
            targets_number=targets_number,
            coeffs=self.normalizer.coefficients)

        self.assign_kernel("evaluate_mse")
        self.set_args(self.output, self.target, self.skip_args(2),
                      self.metrics, self.mse.devmem, self.err_output)

        if self.labels and self.class_targets:
            # Both buffers must be int32 for the closest-target kernel.
            assert self.labels.dtype == self.n_err.dtype == numpy.int32
            closest = self.get_kernel("mse_find_closest")
            closest.set_args(self.output.devmem, self.class_targets.devmem,
                             self.labels.devmem, self.n_err.devmem)
            self.krn_find_closest_ = closest

        return block_size
Example #9
0
    def get_kernel_bs_vo(self, **kwargs):
        """Gets optimal block size and vector_opt
        flag for matrix multiplication.

        Parameters:
            dtype: numeric data type as string (float or double).
            kernel: hint for the name of the kernel for which the optimal
                    block sizes will be returned:
                    conv: convolutional forward propagation,
                    deconv: convolutional back propagation,
                    all other: simple matrix multiplication.
            precision: precision level for summation (0, 1, 2)
                       (defaults to root.common.engine.precision_level).

        Returns:
            BLOCK_SIZE, VECTOR_OPT
        """
        dtype = kwargs["dtype"]
        if not isinstance(dtype, str):
            dtype = opencl_types.numpy_dtype_to_opencl(dtype)
        krnnme = kwargs.get("kernel", "matrix_multiplication")
        # Read the global config lazily: the original dict.get() default was
        # evaluated even when the caller supplied an explicit precision.
        precision = kwargs.get("precision")
        if precision is None:
            precision = root.common.engine.precision_level
        # Safe fallback when no benchmark data is available.
        fallback = (8, False)
        krninfo = self.device_info.get(krnnme)
        if krninfo is None:
            # Benchmark for other kernel types is not implemented,
            # so only debug level here
            self.debug(
                "Kernel \"%s\" was not found, "
                "rolling back to block size for matrix_multiplication",
                krnnme)
            krnnme = "matrix_multiplication"
            krninfo = self.device_info.get(krnnme)
            if krninfo is None:
                self.warning(
                    "krnnme = %s was not found, "
                    "will use block size %d", krnnme, fallback[0])
                return fallback
        typeinfo = krninfo.get(dtype)
        if typeinfo is None:
            self.warning(
                "dtype = %s was not found with krnnme = %s, "
                "will use block size %d", dtype, krnnme, fallback[0])
            return fallback
        # Degrade the precision level until benchmarked data exists.
        bs_dt = typeinfo.get(str(precision))
        while bs_dt is None and precision > 0:
            precision -= 1
            bs_dt = typeinfo.get(str(precision))
        if bs_dt is None:
            self.warning(
                "precision = 0 was not found with krnnme = %s and dtype = %s, "
                "will use block size %d", krnnme, dtype, fallback[0])
            return fallback
        return bs_dt[0], bs_dt[1]
Example #10
0
 def _gpu_init(self):
     """Compile the join program and bind the output plus every input."""
     macro_defs = {
         'etype': opencl_types.numpy_dtype_to_opencl(self.output.dtype),
     }
     cache_name = "%s_%d_%s" % (
         type(self).__name__, self.output.shape[0],
         "_".join(map(str, self.output.shape[1:])))
     self.build_program(macro_defs, cache_name, inputs=self.inputs)
     self.assign_kernel("join")
     self.set_args(self.output, *self.inputs)
Example #11
0
    def get_kernel_bs_vo(self, **kwargs):
        """Gets optimal block size and vector_opt
        flag for matrix multiplication.

        Parameters:
            dtype: numeric data type as string (float or double).
            kernel: hint for the name of the kernel for which the optimal
                    block sizes will be returned:
                    conv: convolutional forward propagation,
                    deconv: convolutional back propagation,
                    all other: simple matrix multiplication.
            precision: precision level for summation (0, 1, 2)
                       (defaults to root.common.engine.precision_level).

        Returns:
            BLOCK_SIZE, VECTOR_OPT
        """
        dtype = kwargs["dtype"]
        if not isinstance(dtype, str):
            dtype = opencl_types.numpy_dtype_to_opencl(dtype)
        krnnme = kwargs.get("kernel", "matrix_multiplication")
        # Read the global config lazily: the original dict.get() default was
        # evaluated even when the caller supplied an explicit precision.
        precision = kwargs.get("precision")
        if precision is None:
            precision = root.common.engine.precision_level
        # Safe fallback when no benchmark data is available.
        fallback = (8, False)
        krninfo = self.device_info.get(krnnme)
        if krninfo is None:
            # Benchmark for other kernel types is not implemented,
            # so only debug level here
            self.debug(
                "Kernel \"%s\" was not found, "
                "rolling back to block size for matrix_multiplication", krnnme)
            krnnme = "matrix_multiplication"
            krninfo = self.device_info.get(krnnme)
            if krninfo is None:
                self.warning(
                    "krnnme = %s was not found, "
                    "will use block size %d", krnnme, fallback[0])
                return fallback
        typeinfo = krninfo.get(dtype)
        if typeinfo is None:
            self.warning(
                "dtype = %s was not found with krnnme = %s, "
                "will use block size %d", dtype, krnnme, fallback[0])
            return fallback
        # Degrade the precision level until benchmarked data exists.
        bs_dt = typeinfo.get(str(precision))
        while bs_dt is None and precision > 0:
            precision -= 1
            bs_dt = typeinfo.get(str(precision))
        if bs_dt is None:
            self.warning(
                "precision = 0 was not found with krnnme = %s and dtype = %s, "
                "will use block size %d", krnnme, dtype, fallback[0])
            return fallback
        return bs_dt[0], bs_dt[1]
Example #12
0
 def build_program(self, defines=None, cache_file_name=None, dtype=None,
                   **kwargs):
     """Normalize the arguments and delegate to the backend program builder."""
     # Fall back to the unit's name when no cache file name was supplied.
     cache_name = self.name if cache_file_name is None else cache_file_name
     if not isinstance(cache_name, str):
         raise ValueError("cache_file_name must be a string")
     # dtype may arrive as None (use the configured default), a string
     # (already usable as-is) or another object (convert it).
     if dtype is None:
         dtype = root.common.engine.precision_type
     elif not isinstance(dtype, str):
         dtype = opencl_types.numpy_dtype_to_opencl(dtype)
     return self._backend_build_program_(defines, cache_name, dtype, kwargs)
Example #13
0
    def ocl_init(self):
        """Allocate device buffers, compile the program and bind all kernels."""
        # Ensure every participating buffer is materialized on the device.
        for buf in (self.input, self.weights, self.winners, self.argmins,
                    self._distances, self._coords):
            buf.initialize(self.device)

        batch_size = self.input.mem.shape[0]
        chunk_size = self._neurons_number // self.device.max_group_size
        if chunk_size < 2:
            chunk_size = self._neurons_number // 2 + 1
        self.argmin_group_size = int(
            numpy.ceil(float(self._neurons_number) / chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        coord_type = "%s%d" % (
            opencl_types.numpy_dtype_to_opencl(self._coords.mem.dtype),
            self._coords.mem.shape[-1])
        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch_size,
            'SAMPLE_LENGTH': self._sample_length,
            'NEURONS_NUMBER': self._neurons_number,
            'CHUNK_SIZE': chunk_size,
            'GRADIENT_CHUNK_SIZE': self.device.max_group_size,
            'coord_type': coord_type,
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        cache_name = "%s_%d_%d_%d" % (
            self.__class__.__name__, batch_size,
            self._sample_length, self._neurons_number)
        self.build_program(defines, cache_name, dtype=self.weights.mem.dtype)

        self.ocl_consts_ = numpy.zeros(1, dtype=self.weights.mem.dtype)

        # Kernel argument wiring; indices skipped here are filled at run time.
        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(
            self.input.devmem, self.weights.devmem, self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(
            self._distances.devmem, self.argmins.devmem, self.winners.devmem)

        self._krn_gravity_ = self.get_kernel("compute_gravity")
        self._krn_gravity_.set_args(self.argmins.devmem, self._coords.devmem)
        self._krn_gravity_.set_arg(3, self._distances.devmem)

        self._krn_apply_gradient_ = self.get_kernel("apply_gradient")
        self._krn_apply_gradient_.set_args(
            self.input.devmem, self._distances.devmem)
        self._krn_apply_gradient_.set_arg(3, self.weights.devmem)

        self._gs_distance = [roundup(self._neurons_number, block_size),
                             roundup(batch_size, block_size)]
        self._ls_distance = [block_size, block_size]
Example #14
0
    def ocl_init(self):
        """Prepare device memory, build the program and set kernel arguments."""
        device = self.device
        self.input.initialize(device)
        self.weights.initialize(device)
        self.winners.initialize(device)
        self.argmins.initialize(device)
        self._distances.initialize(device)
        self._coords.initialize(device)

        batch = self.input.mem.shape[0]
        neurons = self._neurons_number
        chunk = neurons // device.max_group_size
        if chunk < 2:
            chunk = neurons // 2 + 1
        self.argmin_group_size = int(numpy.ceil(float(neurons) / chunk))

        block_size, vector_opt = device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch,
            'SAMPLE_LENGTH': self._sample_length,
            'NEURONS_NUMBER': neurons,
            'CHUNK_SIZE': chunk,
            'GRADIENT_CHUNK_SIZE': device.max_group_size,
            'coord_type': "%s%d" % (
                opencl_types.numpy_dtype_to_opencl(self._coords.mem.dtype),
                self._coords.mem.shape[-1]),
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        self.build_program(
            defines,
            "%s_%d_%d_%d" % (self.__class__.__name__, batch,
                             self._sample_length, neurons),
            dtype=self.weights.mem.dtype)

        self.ocl_consts_ = numpy.zeros(1, dtype=self.weights.mem.dtype)

        # Bind each kernel's static arguments; skipped argument indices are
        # supplied later, at execution time.
        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem, self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(self._distances.devmem, self.argmins.devmem,
                                   self.winners.devmem)

        self._krn_gravity_ = self.get_kernel("compute_gravity")
        self._krn_gravity_.set_args(self.argmins.devmem, self._coords.devmem)
        self._krn_gravity_.set_arg(3, self._distances.devmem)

        self._krn_apply_gradient_ = self.get_kernel("apply_gradient")
        self._krn_apply_gradient_.set_args(self.input.devmem,
                                           self._distances.devmem)
        self._krn_apply_gradient_.set_arg(3, self.weights.devmem)

        self._gs_distance = [
            roundup(neurons, block_size),
            roundup(batch, block_size),
        ]
        self._ls_distance = [block_size, block_size]