Beispiel #1
0
    def __init__(
        self,
        ta=1,
        tpb=256,
        sm_size=48 * 256 * 4,
    ):
        """Read the GetDivOfAddressV2 kernel template and wrap it in a RawKernel.

        Args:
            ta: how many clusters each thread is responsible for; substituted
                for ``_TA_`` in the kernel source.
            tpb: value substituted for ``_TPB_`` in the kernel source
                (presumably threads per block -- confirm against the .cu file).
            sm_size: upper bound that the ``ta * tpb * 8`` bytes requested as
                dynamic shared memory must not exceed.

        Raises:
            AssertionError: if ``ta * tpb * 8 > sm_size`` (only when
                assertions are enabled).
        """
        super(GetDivOfAddressV2CUDA, self).__init__()
        self.ta = ta
        self.tpb = tpb
        self.sm_size = sm_size

        shared_bytes = ta * tpb * 8
        assert shared_bytes <= sm_size

        source_path = get_absolute_path("kernels",
                                        "GetDivOfAddressV2Kernel.cu")
        with open(source_path, "r") as source_file:
            self.kernel = source_file.read()

        # self.kernel keeps the raw template; only the copy handed to the
        # compiler has its placeholders filled in.
        specialised = self.kernel
        for placeholder, value in (("_TA_", ta), ("_TPB_", tpb)):
            specialised = specialised.replace(placeholder, str(value))

        # The '--maxrregcount=255' compile option is currently not passed.
        self.fn = cp.RawKernel(
            code=specialised,
            name='get_div_of_address',
            backend='nvcc',
        )
        self.fn.max_dynamic_shared_size_bytes = shared_bytes
    def __init__(
        self,
        de=16,
        dk=16,
        sm_size=48 * 256 * 4,
    ):
        """Read the ComputeCentroids kernel template and wrap it in a RawKernel.

        Args:
            de: value substituted for ``_DE_`` in the kernel source.
            dk: value substituted for ``_DK_`` in the kernel source; also
                determines ``_NITERS_`` as ``ceil(dk / tpb)``.
            sm_size: byte count assigned to the kernel's
                ``max_dynamic_shared_size_bytes``; must be at least
                ``dk * (de + 1) * 4``.

        Raises:
            AssertionError: if ``dk * (de + 1) * 4 > sm_size`` (only when
                assertions are enabled).
        """
        super(ComputeCentroidsCUDA, self).__init__()
        self.de = de
        self.dk = dk
        required_bytes = dk * (de + 1) * 4
        assert required_bytes <= sm_size
        self.tpb = 256  # fixed here; substituted for _TPB_ below
        self.sm_size = sm_size

        source_path = get_absolute_path("kmeans", "kernels",
                                        "ComputeCentroidsKernel.cu")
        with open(source_path, "r") as source_file:
            self.kernel = source_file.read()

        # Fill the placeholders on a copy so self.kernel keeps the template.
        substitutions = (
            ("_DE_", de),
            ("_DK_", dk),
            ("_TPB_", self.tpb),
            ("_NITERS_", math.ceil(dk / self.tpb)),
        )
        specialised = self.kernel
        for placeholder, value in substitutions:
            specialised = specialised.replace(placeholder, str(value))

        # Compiled with the default backend and no extra compiler options.
        self.fn = cp.RawKernel(code=specialised, name='compute_centroids')

        self.fn.max_dynamic_shared_size_bytes = sm_size
Beispiel #3
0
    def __init__(
        self,
        tpb=256,
        sm_size=48 * 256 * 4,
    ):
        """Read the GetIOA kernel template and wrap it in a RawKernel.

        Args:
            tpb: value substituted for ``_TPB_`` in the kernel source
                (presumably threads per block -- confirm against the .cu file).
            sm_size: stored on the instance; this constructor does not use it
                for anything else.
        """
        super(GetIOACUDA, self).__init__()
        self.tpb = tpb
        self.sm_size = sm_size

        source_path = get_absolute_path("kernels", "GetIOAKernel.cu")
        with open(source_path, "r") as source_file:
            self.kernel = source_file.read()

        # self.kernel keeps the raw template; the compiled text is a copy.
        specialised = self.kernel.replace("_TPB_", str(tpb))

        # The '--maxrregcount=255' compile option is currently not passed.
        self.fn = cp.RawKernel(
            code=specialised,
            name='get_ioa',
            backend='nvcc',
        )
Beispiel #4
0
    def __init__(
        self,
        m=8,
        k=256,
        n_cs=4,
        sm_size=48 * 256 * 4,
    ):
        """Generate and wrap the ``compute_product`` kernel.

        The kernel template contains ``_CODEBLOCK_`` and ``_VARNAMES_``
        placeholders that are expanded into ``n_cs`` generated statements /
        names before the numeric placeholders are filled in.

        Args:
            m: value substituted for ``_M_`` in the kernel source.
            k: value substituted for ``_K_`` in the kernel source.
            n_cs: number of generated ``Bval``/``d`` entries; also substituted
                for ``_NCS_``.
            sm_size: byte count assigned to the kernel's
                ``max_dynamic_shared_size_bytes``.
        """
        super(ComputeProductCUDA, self).__init__()
        self.m = m
        self.k = k
        self.tpb = 256  # fixed here; substituted for _TPB_ below
        self.n_cs = n_cs
        self.sm_size = sm_size

        source_path = get_absolute_path("kernels", "ComputeProductKernel.cu")
        with open(source_path, "r") as source_file:
            self.kernel = source_file.read()

        # One load statement per entry. The un-braced ``i`` is emitted
        # verbatim into the generated CUDA text; only ``{idx}`` is the Python
        # index. ``_NCS_`` / ``_K_`` are resolved by the substitutions below.
        loads = "\n".join(
            f"      float Bval{idx} = Bsh[(i * _NCS_ + {idx}) * _K_ + int(Avals.d{idx}) ];"
            for idx in range(n_cs))
        sums = "\n".join(f"      sum += Bval{idx};" for idx in range(n_cs))
        codeblock = loads + "\n" + sums
        varnames = ", ".join(f"d{idx}" for idx in range(n_cs))

        # Order matters: the code block must be inserted before the
        # placeholders it contains are replaced.
        substitutions = (
            ("_CODEBLOCK_", codeblock),
            ("_VARNAMES_", varnames),
            ("_M_", str(m)),
            ("_K_", str(k)),
            ("_TPB_", str(self.tpb)),
            ("_NCS_", str(n_cs)),
        )
        specialised = self.kernel
        for placeholder, replacement in substitutions:
            specialised = specialised.replace(placeholder, replacement)

        # Compiled with the default backend and no extra compiler options.
        self.fn = cp.RawKernel(code=specialised, name='compute_product')

        self.fn.max_dynamic_shared_size_bytes = sm_size
Beispiel #5
0
    def __init__(
        self,
        tm=2,
        td=8,
    ):
        """Read the PQDecode kernel template and wrap it in a RawKernel.

        Args:
            tm: value substituted for ``_TM_`` in the kernel source.
            td: value substituted for ``_TD_`` in the kernel source.

        The dynamic shared memory limit is derived from the arguments as
        ``td * tm * 256 * 4`` bytes.
        """
        super(PQDecodeCUDA, self).__init__()
        self.tm = tm
        self.td = td
        self.tpb = 256  # fixed here; substituted for _TPB_ below
        self.sm_size = td * tm * 256 * 4

        source_path = get_absolute_path("kernels", "PQDecodeKernel.cu")
        with open(source_path, "r") as source_file:
            self.kernel = source_file.read()

        # Fill the placeholders on a copy so self.kernel keeps the template.
        specialised = self.kernel
        for placeholder, value in (("_TD_", td), ("_TM_", tm),
                                   ("_TPB_", self.tpb)):
            specialised = specialised.replace(placeholder, str(value))

        self.fn = cp.RawKernel(code=specialised, name='pq_decode')

        self.fn.max_dynamic_shared_size_bytes = self.sm_size
Beispiel #6
0
    def __init__(self, m=None, n=None, k=None, dim=None, distance="euclidean"):
        """Specialise the MaxSim kernel source and build its four entry points.

        Args:
            m, n, k, dim: when truthy, the value is written into the kernel
                source in place of ``_M_`` / ``_N_`` / ``_K_`` / ``_DIM_``;
                otherwise the placeholder becomes the bare identifier
                ``M`` / ``N`` / ``K`` / ``DIM``.
            distance: one of ``"euclidean"``/``"l2"``, ``"manhattan"``/``"l1"``,
                ``"inner"`` or ``"cosine"``; selects the function name
                substituted for ``_DISTFN_``. ``"cosine"`` uses the same
                function as ``"inner"`` and prints a reminder that inputs are
                not normalised here.

        Raises:
            ValueError: if ``distance`` is not one of the recognised names.
        """
        super(MaxSimCUDA, self).__init__()
        self.m = m
        self.n = n
        self.k = k
        self.dim = dim
        self.distance = distance

        source_path = get_absolute_path("kmeans", "kernels", "MaxSimKernel.cu")
        with open(source_path, 'r') as source_file:
            self.kernel = source_file.read()

        # (accepted names, device function, whether to print the cosine note);
        # checked in order, first match wins.
        distance_table = (
            (("euclidean", "l2"), "thread_nseuclidean", False),
            (("manhattan", "l1"), "thread_nmanhattan", False),
            (("inner",), "thread_matmul", False),
            (("cosine",), "thread_matmul", True),
        )
        for aliases, distfn, needs_note in distance_table:
            if distance in aliases:
                break
        else:
            raise ValueError("unrecognized distance type")
        if needs_note:
            print(
                "warning: input matrices will not be normalized, please normalize them manually for cosine similarity"
            )

        # Unlike the raw template read above, self.kernel ends up holding the
        # fully substituted source.
        specialised = self.kernel
        for placeholder, value, fallback in (
            ("_M_", m, "M"),
            ("_N_", n, "N"),
            ("_K_", k, "K"),
            ("_DIM_", dim, "DIM"),
        ):
            specialised = specialised.replace(placeholder,
                                              str(value) if value else fallback)
        self.kernel = specialised.replace("_DISTFN_", distfn)

        # All four entry points come from the same source with the same
        # backend and compiler options; only the kernel name differs.
        shared_kwargs = dict(
            code=self.kernel,
            backend='nvcc',
            options=('--maxrregcount=128', '--use_fast_math'),
        )
        self._fn_tt = cp.RawKernel(name="max_sim_tt", **shared_kwargs)
        self._fn_nn = cp.RawKernel(name="max_sim_nn", **shared_kwargs)
        self._fn_tn = cp.RawKernel(name="max_sim_tn", **shared_kwargs)
        self._fn_nt = cp.RawKernel(name="max_sim_nt", **shared_kwargs)