Example no. 1
def ssb_kernel(processed4D,real_calibration,aperture,voltage):
    data_size = processed4D.shape
    wavelength = e_lambda(voltage)
    cutoff = aperture/wavelength
    four_y = np.fft.fftshift(np.fft.fftfreq(data_size[0], real_calibration))
    four_x = np.fft.fftshift(np.fft.fftfreq(data_size[1], real_calibration))
    Four_Y,Four_X = np.meshgrid(four_y,four_x)
    FourXY = np.sqrt((Four_Y ** 2) + (Four_X**2))
    Left_Lobe = np.zeros(data_size,dtype=bool)
    RightLobe = np.zeros_like(Left_Lobe)
    
    #convert to CuPy arrays
    Four_Y = cp.asarray(Four_Y)
    Four_X = cp.asarray(Four_X)
    FourXY = cp.asarray(FourXY)
    Left_Lobe = cp.asarray(Left_Lobe)
    RightLobe = cp.asarray(RightLobe)
    rsize = cp.asarray((data_size[0],data_size[1]),dtype=int)
    
    #pass to JIT kernel
    lobe_calc(Left_Lobe,RightLobe,Four_Y,Four_X,FourXY,rsize,cutoff)
    
    data_phase = phase_cupy(processed4D)
    data_ampli = ampli_cupy(processed4D)
    left_trotter = cp.multiply(data_ampli[Left_Lobe],cp.exp((1j)*data_phase[Left_Lobe]))
    left_image = cp.asnumpy(cp.fft.ifft2(cp.sum(left_trotter,axis=-1)))
    right_trotter = cp.multiply(data_ampli[RightLobe],cp.exp((1j)*data_phase[RightLobe]))
    right_image = cp.asnumpy(cp.fft.ifft2(cp.sum(right_trotter,axis=-1)))
    
    return left_image,right_image
Example no. 2
    def inverse_transform(self, array_in, array_out):
        """
        Perform the inverse Fourier transform of array_in,
        and store the result in array_out

        Parameters
        ----------
        array_in, array_out: cuda device arrays or numpy arrays
            When using the GPU, these should be cuda device arrays.
            When using the CPU, array_in should be one of the
            two buffers that are returned by `get_buffers`
        """
        if self.use_cuda:
            # Copy 2D arrays to 1D array for optimized 1D batch FFT
            cuda_copy_2d_to_1d[self.dim_grid, self.dim_block](array_in,
                                                              self.buffer1d_in)
            # Perform inverse FFT
            self.fft.fft(self.buffer1d_in, self.buffer1d_out,
                         cufft.CUFFT_INVERSE)
            # Normalize inverse FFT
            cupy.multiply(self.buffer1d_out,
                          self.inv_Nz,
                          out=self.buffer1d_out)
            # Copy 1D arrays back to 2D array
            cuda_copy_1d_to_2d[self.dim_grid,
                               self.dim_block](self.buffer1d_out, array_out)
        elif self.use_mkl:
            # Perform the inverse FFT on the CPU using MKL
            self.mklfft.inverse_transform(array_in, array_out)
        else:
            # Perform the inverse FFT on the CPU using FFTW
            self.ifft.update_arrays(new_input_array=array_in,
                                    new_output_array=array_out)
            self.ifft()
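For reference, a minimal CuPy-only sketch of the normalization concern above: the raw cuFFT call with CUFFT_INVERSE is unnormalized, which is why the result is scaled by self.inv_Nz, whereas the high-level cupy.fft.ifft applies the 1/N factor itself. (Illustrative sketch, not part of the original class.)

import cupy as cp

x = cp.random.rand(8) + 1j * cp.random.rand(8)
X = cp.fft.fft(x)           # forward transform
x_back = cp.fft.ifft(X)     # cupy.fft.ifft already divides by N
assert cp.allclose(x, x_back)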
Example no. 3
    def back_prop(self, dloss_dy):
        """
        Do backward propagation through the layer.

        Parameters
        ----------
        dloss_dy : cp.array of floats, shape (number of examples,) + self.output_size
            Derivative of loss with respect to output values.
            
        Returns
        -------
        cp.array of floats, shape (number of examples,) + self.input_size
            Derivative of loss with respect to input values.
        """

        # Keep track of all the dimensions.
        nr_examples = dloss_dy.shape[0]
        a, b, _ = self.input_size
        m, n, nr_channels = self.output_size
        p, q = self.pool_size

        # Expand the derivative to the input shape.
        dloss_dy_reshaped = dloss_dy.reshape(
            (nr_examples, m, 1, n, 1, nr_channels))
        dloss_dy_expanded = cp.multiply(
            dloss_dy_reshaped, cp.ones((1, 1, p, 1, q, 1), dtype=cp.int8))
        dloss_dy_expanded = dloss_dy_expanded.reshape(
            (nr_examples, a, b, nr_channels))

        # Apply the cached mask to the derivative.
        return cp.multiply(dloss_dy_expanded, self.i_cache)
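A self-contained sketch of the same expand-and-mask idea for a single 2x2 max pool; the boolean mask plays the role of self.i_cache, and all shapes and names are illustrative assumptions rather than the original layer:

import cupy as cp

x = cp.arange(16, dtype=cp.float32).reshape(1, 4, 4, 1)   # one example, one channel
pooled = x.reshape(1, 2, 2, 2, 2, 1).max(axis=(2, 4))      # 2x2 max pooling
mask = (x == pooled.repeat(2, axis=1).repeat(2, axis=2))   # cached positions of the maxima
dloss_dy = cp.ones_like(pooled)                            # upstream gradient
dloss_dx = mask * dloss_dy.repeat(2, axis=1).repeat(2, axis=2)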
Example no. 4
 def _derivativenorm(self):
     """Compute the derivative of the norm
     Returns
     -------
     derivative : numpy array, shape (m_parameters,)
     """
     w2 = cp.reshape(self.w,(self.n_features,self.d,self.D,self.D))
     derivative = cp.zeros((self.n_features,self.d,self.D,self.D)) 
     
     tmp=cp.zeros((self.n_features,self.D))
     tmp2=cp.zeros((self.n_features,self.D))
     tmp[0,:]=cp.sum(cp.square(w2[0,:,0,:]),0)
     for i in range(1,self.n_features-1):
         tmp[i,:]=cp.dot(tmp[i-1,:],cp.sum(cp.square(w2[i,:,:,:]),0)) 
     tmp[self.n_features-1,:]=cp.inner(tmp[self.n_features-2,:],
             cp.sum(cp.square(w2[self.n_features-1,:,:,0]),0))
     tmp2[self.n_features-1,:]=cp.sum(cp.square(w2[self.n_features-1,:,:,0]),0)
     for i in range(self.n_features-2,-1,-1):
         tmp2[i,:]=cp.dot(cp.sum(cp.square(w2[i,:,:,:]),0),tmp2[i+1,:])
     tmp2[0,:]=cp.inner(cp.sum(cp.square(w2[0,:,0,:]),0),tmp2[1,:])
 
     for j in range(self.d):
         derivative[0,j,0,:]=cp.multiply(tmp2[1,:],2*(w2[0,j,0,:]))
         derivative[self.n_features-1,j,:,0]=\
             cp.multiply(tmp[self.n_features-2,:],2*(w2[self.n_features-1,j,:,0]))
     for i in range(1,self.n_features-1):
         temp3=cp.outer(tmp[i-1,:],tmp2[i+1,:])
         for j in range(self.d):
             derivative[i,j,:,:]=cp.multiply(temp3,2*(w2[i,j,:,:]))
     return derivative.reshape(self.m_parameters)
Example no. 5
def get_flat_dpc(data4D_flat, chunks=8, centered=True):
    stops = np.zeros(chunks + 1, dtype=int)
    stops[0:chunks] = np.arange(0, data4D_flat.shape[0],
                                (data4D_flat.shape[0] / chunks))
    stops[chunks] = data4D_flat.shape[0]
    if centered:
        cent_x = cp.asarray(data4D_flat.shape[2]) / 2
        cent_y = cp.asarray(data4D_flat.shape[1]) / 2
    else:
        CentralDisk = np.median(data4D_flat, axis=0)
        cent_x, cent_y, _ = st.util.sobel_circle(CentralDisk)
        cent_x = cp.asarray(cent_x)
        cent_y = cp.asarray(cent_y)
    yy, xx = cp.mgrid[0:data4D_flat.shape[1], 0:data4D_flat.shape[2]]
    FlatSum = cp.asarray(np.sum(data4D_flat, axis=(-1, -2)))
    YCom_CPU = np.zeros(data4D_flat.shape[0], dtype=data4D_flat.dtype)
    XCom_CPU = np.zeros(data4D_flat.shape[0], dtype=data4D_flat.dtype)
    for ii in range(chunks):
        startval = stops[ii]
        stop_val = stops[ii + 1]
        gpu_4Dchunk = cp.asarray(data4D_flat[startval:stop_val, :, :])
        FlatY = cp.multiply(gpu_4Dchunk, yy)
        FlatX = cp.multiply(gpu_4Dchunk, xx)
        YCom = (cp.sum(FlatY, axis=(-1, -2)) /
                FlatSum[startval:stop_val]) - cent_y
        XCom = (cp.sum(FlatX, axis=(-1, -2)) /
                FlatSum[startval:stop_val]) - cent_x
        YCom_CPU[startval:stop_val] = cp.asnumpy(YCom)
        XCom_CPU[startval:stop_val] = cp.asnumpy(XCom)
    del YCom, XCom, gpu_4Dchunk, cent_x, cent_y, FlatSum
    return YCom_CPU, XCom_CPU
Example no. 6
def backward_pass(x, y, output, hidden_output, W_output):
    output_error = -(y - output)  # Calculate error
    output_over_net = output*(1 - output)  # Derivative of sigmoid function
    sigmoid_on_error = cp.multiply(output_error, output_over_net)  # Calculate the sigmoid function's effect on the error

    W_output = cp.transpose(W_output)
    hidden_error = cp.dot(sigmoid_on_error, W_output)  # Calculate the effect of the output weights on the hidden weights' error
    hidden_over_net = hidden_output*(1 - hidden_output)  # Derivative of sigmoid function
    sigmoid_on_hidden_error = cp.multiply(hidden_error, hidden_over_net)  # Calculate the sigmoid function's effect on the error

    # Correctly arrange matrices for calculations
    x = cp.atleast_2d(x)
    hidden_output = cp.atleast_2d(hidden_output)
    x_transpose = cp.transpose(x)
    hidden_output_transpose = cp.transpose(hidden_output)
    sigmoid_on_hidden_error = sigmoid_on_hidden_error.reshape(1, sigmoid_on_hidden_error.size)
    sigmoid_on_error = sigmoid_on_error.reshape(1, sigmoid_on_error.size)

    # Calculate weight changes
    W_hidden_c = cp.dot(x_transpose, sigmoid_on_hidden_error)
    W_output_c = cp.dot(hidden_output_transpose, sigmoid_on_error)

    # Calculate bias changes
    B_hidden_c = sigmoid_on_hidden_error
    B_output_c = sigmoid_on_error

    return W_output_c, W_hidden_c, B_hidden_c, B_output_c
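A hedged sketch of the forward pass that backward_pass above assumes; sigmoid and the weight/bias names are illustrative assumptions, not taken from the original source:

import cupy as cp

def sigmoid(z):
    return 1.0 / (1.0 + cp.exp(-z))

def forward_pass(x, W_hidden, B_hidden, W_output, B_output):
    # hidden layer and output layer, both with sigmoid activations
    hidden_output = sigmoid(cp.dot(x, W_hidden) + B_hidden)
    output = sigmoid(cp.dot(hidden_output, W_output) + B_output)
    return output, hidden_output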
Example no. 7
def normal(loc=0.0, scale=1.0, size=None, dtype=float):
    """Returns an array of normally distributed samples.

    Args:
        loc (float or array_like of floats): Mean of the normal distribution.
        scale (float or array_like of floats):
            Standard deviation of the normal distribution.
        size (int or tuple of ints): The shape of the array. If ``None``, a
            zero-dimensional array is generated.
        dtype: Data type specifier. Only :class:`numpy.float32` and
            :class:`numpy.float64` types are allowed.

    Returns:
        cupy.ndarray: Normally distributed samples.

    .. seealso:: :func:`numpy.random.normal`

    """
    rs = _generator.get_random_state()
    if size is None and any(isinstance(arg, cupy.ndarray)
                            for arg in [scale, loc]):
        size = cupy.broadcast_arrays(loc, scale)[0].shape
    x = rs.normal(0, 1, size, dtype)
    cupy.multiply(x, scale, out=x)
    cupy.add(x, loc, out=x)
    return x
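A hedged usage sketch through the public cupy.random.normal API, showing the scale-and-shift performed above (values are illustrative):

import cupy

samples = cupy.random.normal(loc=2.0, scale=0.5, size=(100000,), dtype=cupy.float64)
print(float(samples.mean()), float(samples.std()))   # roughly 2.0 and 0.5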
Example no. 8
    def normal(self, loc=0.0, scale=1.0, size=None, dtype=float):
        """Returns an array of normally distributed samples.

        .. seealso::
            - :func:`cupy.random.normal` for full documentation
            - :meth:`numpy.random.RandomState.normal`

        """
        dtype = _check_and_get_dtype(dtype)
        if size is None:
            size = cupy.broadcast(loc, scale).shape
        if dtype.char == 'f':
            func = curand.generateNormal
        else:
            func = curand.generateNormalDouble
        if isinstance(scale, cupy.ndarray):
            x = self._generate_normal(func, size, dtype, 0.0, 1.0)
            cupy.multiply(x, scale, out=x)
            cupy.add(x, loc, out=x)
        elif isinstance(loc, cupy.ndarray):
            x = self._generate_normal(func, size, dtype, 0.0, scale)
            cupy.add(x, loc, out=x)
        else:
            x = self._generate_normal(func, size, dtype, loc, scale)
        return x
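A hedged usage sketch through the public RandomState wrapper (the seed is illustrative):

import cupy

rs = cupy.random.RandomState(seed=0)
x = rs.normal(loc=1.0, scale=2.0, size=(3, 3))   # float64 by default
print(float(x.mean()))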
Example no. 9
def calculate_loss_modified(prediction, y):
    prediction[prediction == 0] = 0.00000001
    y[y == 0] = 0.00000001
    lossExpression = -cp.sum(
        cp.multiply(y, cp.log(prediction)) + cp.multiply(
            cp.ones(y.shape) - y, cp.log(cp.ones(y.shape) - prediction)))
    return lossExpression
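A hedged usage sketch with toy one-hot targets; note that the function mutates its inputs in place when they contain exact zeros:

import cupy as cp

y = cp.array([[1., 0.], [0., 1.]])
pred = cp.array([[0.9, 0.1], [0.2, 0.8]])
print(float(calculate_loss_modified(pred, y)))   # total binary cross-entropy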
Example no. 10
 def __call__(self, params, g_params):
     new_params = tuple(
         cp.add(
             param,
             cp.subtract(cp.multiply(self.momentum, cp.subtract(param, v)),
                         cp.multiply(self.rate, g_param)))
         for param, g_param, v in zip(params, g_params, self.v))
     self.v = params
     return new_params
Example no. 11
 def __iteration(self, gamma: cp.float64):
     """
     Iteration of the inner loop of the (iterated) Tikhonov method.
     :param gamma: Regularization parameter of the Tikhonov algorithm.
     :type gamma: float
      :return: None; the solution for this iteration is stored in ``self.current``.
     """
     LU, P = linalg.lu_factor(cp.add(self.KHK, cp.multiply(gamma, self.identity)))
     self.current = linalg.lu_solve((LU, P), cp.add(self.q_estimator, cp.multiply(gamma, self.previous)))
Example no. 12
    def gpu_gaussian(self, a, b, s):
        km = cp.empty(shape=(a.shape[0], b.shape[0]), dtype=a.dtype)
        km = cp.multiply(cp.dot(a, b.T, out=km), -2, out=km)
        km += cp.power(a, 2).sum(axis=1).reshape(-1, 1)
        km += cp.power(b, 2).sum(axis=1)

        cp.multiply(km, -1 / (2 * s * s), out=km)
        cp.exp(km, out=km)
        return km
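A hedged check of the expansion used above, ||a_i - b_j||^2 = ||a_i||^2 + ||b_j||^2 - 2 a_i.b_j, against the direct pairwise computation (array sizes are illustrative):

import cupy as cp

a = cp.random.rand(5, 3)
b = cp.random.rand(4, 3)
s = 1.3
direct = cp.exp(-cp.sum((a[:, None, :] - b[None, :, :]) ** 2, axis=-1) / (2 * s * s))
# direct should agree with the km returned by gpu_gaussian(a, b, s) up to rounding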
Example no. 13
def pulse_compression(x, template, normalize=False, window=None, nfft=None):
    """
    Pulse Compression is used to increase the range resolution and SNR
    by performing matched filtering of the transmitted pulse (template)
    with the received signal (x)

    Parameters
    ----------
    x : ndarray
        Received signal, assume 2D array with [num_pulses, sample_per_pulse]

    template : ndarray
        Transmitted signal, assume 1D array

    normalize : bool
        Normalize transmitted signal

    window : array_like, callable, string, float, or tuple, optional
        Specifies the window applied to the signal in the Fourier
        domain.

    nfft : int, optional
        Size of the FFT used for pulse compression. Default is the number
        of samples per pulse.

    Returns
    -------
    compressedIQ : ndarray
        Pulse compressed output
    """
    [num_pulses, samples_per_pulse] = x.shape

    if nfft is None:
        nfft = samples_per_pulse

    if window is not None:
        Nx = len(template)
        if callable(window):
            W = window(cp.fft.fftfreq(Nx))
        elif isinstance(window, cp.ndarray):
            if window.shape != (Nx, ):
                raise ValueError("window must have the same length as data")
            W = window
        else:
            W = get_window(window, Nx, False)

        template = cp.multiply(template, W)

    if normalize is True:
        template = cp.divide(template, cp.linalg.norm(template))

    fft_x = cp.fft.fft(x, nfft)
    fft_template = cp.conj(cp.tile(cp.fft.fft(template, nfft),
                                   (num_pulses, 1)))
    compressedIQ = cp.fft.ifft(cp.multiply(fft_x, fft_template), nfft)

    return compressedIQ
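A hedged usage sketch with a synthetic linear-FM chirp; the shapes and chirp parameters are illustrative assumptions:

import cupy as cp

num_pulses, samples_per_pulse = 4, 256
t = cp.linspace(0, 1, samples_per_pulse)
template = cp.exp(2j * cp.pi * (10 * t + 40 * t ** 2))    # linear FM chirp
x = cp.tile(template, (num_pulses, 1))
x = x + 0.05 * cp.random.standard_normal(x.shape)         # additive noise
compressed = pulse_compression(x, template, normalize=True)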
Example no. 14
 def feedforward(self, input):
     self.input = input
     self.ft = sigmoid(self.Wf@input + [email protected]_ht + self.bf)  # forget gate
     self.it = sigmoid(self.Wi@input + [email protected]_ht + self.bi)  # update gate
     self.ot = sigmoid(self.Wo@input + [email protected]_ht + self.bo)  # output gate
     self.ct_bar = tanh(self.Wc @ input + self.Uc @ self.prev_ht + self.bc)
     # outputs
     self.ct = cp.multiply(self.ft, self.prev_ct) + cp.multiply(self.it, self.ct_bar)
     self.ht = cp.multiply(self.ot, tanh(self.ct))
     return self.ct, self.ht
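Minimal helper definitions assumed by the cell above (a hedged sketch; the sigmoid/tanh actually used by the original code may differ):

import cupy as cp

def sigmoid(z):
    return 1.0 / (1.0 + cp.exp(-z))

def tanh(z):
    return cp.tanh(z)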
Example no. 15
 def update_reservoir(self, u, n, Y):
     # u is input at specific time
     #   u has shape (N_u (3 for L63))
     # See page 16 eqtn 18 of Lukosevicius PracticalESN for feedback info.
     x_n_tilde = cp.tanh(
         cp.matmul(self.W, self.x[n]) +
         cp.array(sp.matmul(self.W_in, sp.hstack((sp.array([1]), u)))) +
         cp.array(sp.matmul(self.W_fb, Y)))
     self.x[n+1] = cp.multiply((1-cp.array(self.alpha_matrix)), cp.array(self.x[n])) \
           + cp.multiply(cp.array(self.alpha_matrix), x_n_tilde)
Example no. 16
 def diffuse_slime_trail():
     cp.multiply(
         SlimeWorld.cells,
         SlimeWorld.trail_reduction_factor,
         out=SlimeWorld.cells,
         casting="unsafe",
     )
     convolve(SlimeWorld.cells,
              SlimeWorld.trail_kernel,
              output=SlimeWorld.cells)
Example no. 17
def error_minimization(W,
                       b,
                       zeta,
                       a,
                       prev_layer,
                       activation_func,
                       den_activation,
                       y,
                       w=None,
                       d=None,
                       y_pred=None):
    dW = {}
    dB = {}
    delta = {}
    try:
        batch_size = y.shape[1]
    except IndexError:
        batch_size = 1
        y = cp.reshape(y, (y.shape[0], batch_size))

    is_last_layer = (w is None) and (d is None)

    if is_last_layer:

        delta['s'] = cp.subtract(a['s'], y)
        dB['s'] = (1 / batch_size) * cp.sum(delta['s'], axis=1)
        dB['s'] = cp.reshape(dB['s'], (dB['s'].shape[0], 1, 1))

        delta['s'] = cp.reshape(delta['s'],
                                (delta['s'].shape[0], 1, delta['s'].shape[1]))

        dW['s'] = (1 / batch_size) * cp.einsum('nik,kjn->nij', delta['s'],
                                               a['d'].T)

    else:
        w = cp.array(w)

        deltaW = cp.einsum('nik,kij->nj', w.T, d)
        deltaW = cp.reshape(deltaW, (deltaW.shape[0], 1, deltaW.shape[1]))
        a_der = activation(str(activation_func) + '_der', zeta['s'])

        delta['s'] = cp.multiply(deltaW, a_der)
        dB['s'] = (1 / batch_size) * cp.sum(delta['s'].squeeze(), axis=1)
        dB['s'] = cp.reshape(dB['s'], (dB['s'].shape[0], 1, 1))
        dW['s'] = (1 / batch_size) * cp.einsum('nik,kjn->nij', delta['s'],
                                               a['d'].T)

    deltaW = cp.einsum('nik,kij->knj', W['s'].T, delta['s'])
    a_der = activation(den_activation + '_der', zeta['d'])
    delta['d'] = cp.multiply(deltaW, a_der)
    dB['d'] = (1 / batch_size) * cp.sum(delta['d'], axis=2)
    dB['d'] = cp.reshape(dB['d'], (dB['d'].shape[0], dB['d'].shape[1], 1))
    dW['d'] = (1 / batch_size) * cp.dot(delta['d'], prev_layer.T)
    return [dW, dB, delta]
Example no. 18
 def func(a, t, params, A, function, bT, x, division):
     index = int(t * (division - 1))
     return cp.multiply(
         -1.,
         cp.add(
             cp.dot(a, params[1][index]),
             cp.dot(
                 cp.multiply(
                     bT,
                     cp.multiply(params[0][index],
                                 function(cp.dot(x[index], A.T)))), A)))
Example no. 19
    def dyad_transform(self, grids):
        """
        Experimental: compute spectrum of the dyad v_i * v_j
        """
        k_xx = grids.fourier_transform(function=cp.multiply(self.arr[0, 1:-1, :, 1:-1, :],
                                                            self.arr[0, 1:-1, :, 1:-1, :]))
        k_xy = grids.fourier_transform(function=cp.multiply(self.arr[0, 1:-1, :, 1:-1, :],
                                                            self.arr[1, 1:-1, :, 1:-1, :]))
        k_yy = grids.fourier_transform(function=cp.multiply(self.arr[1, 1:-1, :, 1:-1, :],
                                                            self.arr[1, 1:-1, :, 1:-1, :]))

        self.dyad_spectrum = cp.array([[k_xx, k_xy], [k_xy, k_yy]])
Example no. 20
def calcDistField(point_file, h5name, save_location):
    data_file = h5py.File(h5name, 'r')
    data = data_file['data'][:]
    data_dim = data.shape[0]
    data_file.close()
    ptfile = h5py.File(point_file, 'r')
    sample_points = ptfile['points'][:]
    ptfile.close()
    sample_size = sample_points.shape[0]

    #gpu parallelization
    memory_pool = cupy.get_default_memory_pool()
    pinned_memory_pool = cupy.get_default_pinned_memory_pool()

    distancesgpu = numpy.zeros((data_dim, data.shape[1], sample_size))
    x = cupy.asarray(sample_points)
    allpts = cupy.tile(x, (data.shape[1], 1))
    blocks = int(numpy.ceil(sample_size * data.shape[1] / 8192))
    del x
    print(blocks)
    yy = cupy.asarray(data)
    for inst in range(data_dim):
        if inst % 200 == 0:
            print(inst)
        y = yy[inst]

        xx = allpts + cupy.tile(y, (1, sample_size)).reshape(-1, 3)
        xdot = cupy.sum(cupy.multiply(xx, xx), axis=1)
        dt = cupy.zeros((sample_size * data.shape[1], ))
        for blk in range(blocks):
            idstart = int(blk * 8192)
            idend = int((blk + 1) * 8192)

            dists = cupy.tile(xdot[idstart:idend], (y.shape[0], 1)).transpose(
            ) - 2 * cupy.matmul(xx[idstart:idend], y.transpose()) + cupy.tile(
                cupy.sum(cupy.multiply(y, y), axis=1).transpose(),
                (xx[idstart:idend].shape[0], 1))
            dt[idstart:idend] = cupy.amin(dists, axis=1)
            del dists
        dt = cupy.reshape(dt, (-1, sample_size))
        distancesgpu[inst] = cupy.asnumpy(dt)
        del dt
        del xx
        del xdot
    memory_pool.free_all_blocks()
    pinned_memory_pool.free_all_blocks()
    # save file
    saveh5 = h5py.File(save_location, 'w')
    saveh5.create_dataset('distances', data=distancesgpu)
    saveh5.close()
Example no. 21
 def _preprocess(self, frame):
     frame_dev = cp.asarray(frame)
     # resize
     zoom = np.roll(self.inp_handle.shape, -1) / frame_dev.shape
     small_dev = cupyx.scipy.ndimage.zoom(frame_dev,
                                          zoom,
                                          order=1,
                                          mode='opencv',
                                          grid_mode=True)
     # BGR to RGB
     rgb_dev = small_dev[..., ::-1]
     # HWC -> CHW
     chw_dev = rgb_dev.transpose(2, 0, 1)
     # normalize to [0, 1] interval
     cp.multiply(chw_dev, 1 / 255., out=self.inp_handle)
Example no. 22
 def do_rmsprop(self, X, Y, update, learning_rate, **kwargs):
     layers = len(self.structure) - 1
     grads = self.calculate_grads(X, Y, kwargs["l2_reg_param"])
     for ii in cp.arange(1, layers + 1):
         update["w" + str(ii)] = kwargs["beta"] * update.get(
             "w" + str(ii), 0) + (1 - kwargs["beta"]) * cp.square(
                 cp.sum(grads["w" + str(ii)], axis=0))
         update["b" + str(ii)] = kwargs["beta"] * update.get(
             "b" + str(ii), 0) + (1 - kwargs["beta"]) * cp.square(
                 cp.sum(grads["b" + str(ii)], axis=1).reshape(-1, 1))
         self.params["w"+str(ii)] -= cp.multiply((learning_rate/ cp.sqrt(kwargs["epsilon"] + update["w"+str(ii)])),\
                                                 cp.sum(grads["w"+str(ii)],axis=0))
         self.params["b"+str(ii)] -= cp.multiply((learning_rate / cp.sqrt(kwargs["epsilon"] + update["b"+str(ii)])),\
                                                 cp.sum(grads["b"+str(ii)],axis=1).reshape(-1,1))
     return update
Example no. 23
def calcMSeries(t_axis, h, t_0, m_0, noiseFlag = False):
    if noiseFlag:
        # RK4 solution with thermal noise
        m_rk4 = []
        m_prev = m_0
        t_prev = t_0
        
        H_eff = cp.array([0, 0, float(H_k*m_0[2])]) + cp.multiply(demag_const, m_0) + cp.array([thermalConst*cp.random.normal(0,1),
                                                                                                thermalConst*cp.random.normal(0,1),
                                                                                                thermalConst*cp.random.normal(0,1)])
        for t in t_axis:
            print("#################################################################")
            print("step t:"+str(t))
            new_m = llgsRK4Heun(t_prev, m_prev, t, h, H_eff) 
            print("new m:"+str(new_m))
            m_rk4.append(new_m)
            m_prev = new_m
            t_prev = t
            H_eff = cp.array([0, 0, float(H_k*new_m[2])]) + cp.multiply(demag_const, new_m) + cp.array([thermalConst*cp.random.normal(0,1),
                                                                                                        thermalConst*cp.random.normal(0,1),
                                                                                                        thermalConst*cp.random.normal(0,1)])
            print("new H_eff:"+str(H_eff))

    else:
        # RK4 solution without thermal noise
        m_rk4 = []
        m_prev = m_0
        t_prev = t_0

        H_eff = cp.array([0, 0, float(H_k*m_0[2])]) + cp.multiply(demag_const, m_0)
        for t in t_axis:
            print("#################################################################")
            print("step t:"+str(t))
            new_m = llgsRK(t_prev, m_prev, t, h, H_eff)
            print("new m:"+str(new_m))
            m_rk4.append(new_m)
            m_prev = new_m
            t_prev = t
            H_eff = cp.array([0, 0, float(H_k*new_m[2])]) + cp.multiply(demag_const, new_m)
            print("new H_eff:"+str(H_eff))

    # change all elements to list and float
    for i in range(0, len(m_rk4)):
        m_rk4[i] = m_rk4[i].tolist()
        for j in range(0, len(m_rk4[i])):
            m_rk4[i][j] = float(m_rk4[i][j])

    return m_rk4
Example no. 24
def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Returns the variance along an axis.

    Args:
        a (cupy.ndarray): Array to compute variance.
        axis (int): Along which axis to compute variance. The flattened array
            is used by default.
        dtype: Data type specifier.
        out (cupy.ndarray): Output array.
        ddof (int): Delta degrees of freedom. The divisor used in the
            calculation is ``N - ddof``, where ``N`` is the number of
            elements along the reduced axes.
        keepdims (bool): If ``True``, the reduced axis is retained as an
            axis of size one.

    Returns:
        cupy.ndarray: The variance of the input array along the axis.

    .. seealso:: :func:`numpy.var`

    """
    if axis is None:
        axis = tuple(range(a.ndim))
    if not isinstance(axis, tuple):
        axis = (axis,)

    if dtype is None and issubclass(a.dtype.type,
                                    (numpy.integer, numpy.bool_)):
        dtype = numpy.dtype(numpy.float64)

    arrmean = mean(a, axis=axis, dtype=dtype, keepdims=True)

    x = cupy.subtract(a, arrmean, dtype=dtype)
    cupy.square(x, x)
    ret = cupy.sum(x, axis=axis, dtype=dtype, out=out, keepdims=keepdims)
    rcount = max(_count_reduce_items(a, axis) - ddof, 0)
    return cupy.multiply(ret, ret.dtype.type(1.0 / rcount), out=ret)
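A hedged usage sketch through the public cupy.var API:

import cupy

a = cupy.arange(12, dtype=cupy.float64).reshape(3, 4)
print(cupy.var(a, axis=0))           # variance of each column
print(cupy.var(a, axis=0, ddof=1))   # unbiased estimator (N - 1 divisor)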
Example no. 25
def GlobalReg(X, T, sigma2, outliers):
    """
    :params:
    :return
    """
    [N, D] = X.shape
    M = T.shape[0]

    # Calculate P matrix
    # Nominator of P
    P_num = cp.sum((X[None, :, :] - T[:, None, :])**2, axis=2)
    P_num = cp.exp(-P_num / (2 * sigma2))
    # Denominator of P
    P_den = cp.sum(P_num, axis=0)
    P_den = cp.tile(P_den, (M, 1))
    P_den[P_den == 0] = 2.220446049250313e-16
    c = (((2 * cp.pi * sigma2) ** (D / 2)) * (outliers / (1 - outliers)) *
         (M / N))
    P_den += c

    P = cp.divide(P_num, P_den)

    P1 = cp.sum(P, axis=1)
    Pt1 = cp.sum(P, axis=0)

    c1 = c * cp.ones(N)
    K1 = cp.dot(cp.transpose(P_num), cp.ones(M))
    a = cp.tile(cp.divide(1, K1 + c1).reshape(N, 1), D)
    Px = cp.dot(P_num, (cp.multiply(a, X)))

    return P1, Pt1, Px
Example no. 26
    def step(self, model, solution, old_solution, *args, **kwargs):
        """Correlation coefficient at current step.

        Args:
            model (tomomak.Model): used model.
            solution (ndarray): supposed solution.
            old_solution (ndarray): supposed_solution at a previous iteration.
             *args: not used, but needed to be here in order to work with Solver properly.
             **kwargs: not used, but needed to be here in order to work with Solver properly.

        Returns:
            float: correlation coefficient.
        """
        det_num = model.detector_signal.shape[0]
        det_num2 = det_num**2
        f_s = cp.sum(old_solution)
        f_new_s = cp.sum(solution)
        corr = det_num2 * cp.sum(cp.multiply(solution, old_solution))
        corr = corr - f_s * f_new_s
        divider = det_num2 * cp.sum(cp.multiply(solution, solution))
        tmp = f_new_s**2
        divider = cp.sqrt(divider - tmp)
        corr = corr / divider
        divider = det_num2 * cp.sum(cp.multiply(old_solution, old_solution))
        tmp = f_s**2
        divider = cp.sqrt(divider - tmp)
        if divider:
            res = corr / divider
        else:
            res = np.nan
        self.data.append(res)
        return res
Example no. 27
def kron(a, b):
    """Returns the kronecker product of two arrays.

    Args:
        a (~cupy.ndarray): The first argument.
        b (~cupy.ndarray): The second argument.

    Returns:
        ~cupy.ndarray: Output array.

    .. seealso:: :func:`numpy.kron`

    """
    a_ndim = a.ndim
    b_ndim = b.ndim
    if a_ndim == 0 or b_ndim == 0:
        return cupy.multiply(a, b)

    ndim = b_ndim
    a_shape = a.shape
    b_shape = b.shape
    if a_ndim != b_ndim:
        if b_ndim > a_ndim:
            a_shape = (1,) * (b_ndim - a_ndim) + a_shape
        else:
            b_shape = (1,) * (a_ndim - b_ndim) + b_shape
            ndim = a_ndim

    axis = ndim - 1
    out = core.tensordot_core(a, b, None, a.size, b.size, 1, a_shape + b_shape)
    for _ in six.moves.range(ndim):
        out = core.concatenate_method(out, axis=axis)

    return out
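A hedged usage sketch through the public cupy.kron API:

import cupy

a = cupy.array([[1, 2], [3, 4]])
b = cupy.eye(2, dtype=a.dtype)
print(cupy.kron(a, b))   # 4x4 block matrix made of a[i, j] * b blocks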
Example no. 28
 def backward(self, dA):
     """Implementation of backward pooling using stride tricks.
     Args:
         dA (np.array): derivative of output values
     Returns:
          np.array: derivative of input values
     """
     if len(dA.shape) == 2:
         dA = dA.reshape(dA.shape[1], *self.dim_out[1:])
     self.dX[:, :, :, :] = 0
     n_h = self.dim_out[1]
     n_w = self.dim_out[2]
     shape = (
         self.X.shape[0],  # m
         n_h,
         n_w,
         self.f,
         self.f,
         self.X.shape[-1])  # n_c
     strides = (self.X.strides[0], self.X.strides[1] * self.stride,
                self.X.strides[2] * self.stride, self.X.strides[1],
                self.X.strides[2], self.X.strides[3])
     M = np.lib.stride_tricks.as_strided(
         self.X, shape=shape, strides=strides)  # , writeable=False)
     # dangerous: writing into memory, don't mess up strides !
     M_dX = np.lib.stride_tricks.as_strided(
         self.dX, shape=shape, strides=strides)  # , writeable=True)
     mask = np.max(M, axis=(-3, -2), keepdims=True) == M
     M_dX += np.multiply(mask, dA[:, :, :, None, None])
     return self.dX
Example no. 29
 def fit_dropout(self, epochs=1, batch_size=1, p=0.5, gamma=0.9, **args):
     X = args['X_train']
     y = args['y_train']
     if 'verbose' in args:
         verbose = args['verbose']
     else:
         verbose = None
     loss_val = cp.zeros(int(epochs))
     par_gpu = deepcopy(self.start)
     momentum = {var: cp.zeros_like(par_gpu[var]) for var in par_gpu.keys()}
     for i in range(int(epochs)):
         for batch in self.iterate_minibatches(X, y, batch_size):
             X_batch, y_batch = batch
             Z = cp.random.binomial(1, p, size=X_batch.shape)
             X_batch_dropout = cp.multiply(X_batch, Z)
             grad_p = self.model.grad(par_gpu,
                                      X_train=X_batch_dropout,
                                      y_train=y_batch)
             for var in par_gpu.keys():
                 momentum[var] = gamma * momentum[
                     var] - self.step_size * grad_p[var]
                 par_gpu[var] += momentum[var]
         loss_val[i] = self.model.negative_log_posterior(par_gpu,
                                                         X_train=X_batch,
                                                         y_train=y_batch)
         if verbose and (i % (epochs / 10) == 0):
             print('loss: {0:.4f}'.format(cp.asnumpy(loss_val[i])))
     return par_gpu, loss_val
Example no. 30
def get_square_sampling_probas(attractivity_cells,
                               square_ids_cells,
                               coords_squares,
                               dscale=1):
    # compute sum attractivities in squares
    sum_attractivity_squares, unique_squares = sum_by_group(
        values=attractivity_cells, groups=square_ids_cells)
    # Compute distances between all squares and squares having sum_attractivity > 0
    mask_attractivity = (sum_attractivity_squares > 0)
    eligible_squares = unique_squares[mask_attractivity]
    sum_attractivity_squares = sum_attractivity_squares[mask_attractivity]

    # Compute distance between cells, add `intra_square_dist` for average intra cell distance
    inter_square_dists = cdist(coords_squares).astype(cp.float32)
    inter_square_dists = inter_square_dists[:, eligible_squares]
    square_sampling_probas = cp.multiply(inter_square_dists, -dscale)
    square_sampling_probas = cp.exp(square_sampling_probas)
    square_sampling_probas *= sum_attractivity_squares[
        None, :]  # row-wise multiplication
    square_sampling_probas /= cp.linalg.norm(square_sampling_probas,
                                             ord=1,
                                             axis=1,
                                             keepdims=True)
    square_sampling_probas = square_sampling_probas.astype(cp.float32)
    return square_sampling_probas
Example no. 31
def inner(a, b):
    """Returns the inner product of two arrays.

    It uses the last axis of each argument to take sum product.

    Args:
        a (cupy.ndarray): The first argument.
        b (cupy.ndarray): The second argument.

    Returns:
        cupy.ndarray: The inner product of ``a`` and ``b``.

    .. seealso:: :func:`numpy.inner`

    """
    a_ndim = a.ndim
    b_ndim = b.ndim
    if a_ndim == 0 or b_ndim == 0:
        return cupy.multiply(a, b)

    a_axis = a_ndim - 1
    b_axis = b_ndim - 1

    if a.shape[-1] != b.shape[-1]:
        raise ValueError('Axis dimension mismatch')

    if a_axis:
        a = cupy.rollaxis(a, a_axis, 0)
    if b_axis:
        b = cupy.rollaxis(b, b_axis, 0)

    ret_shape = a.shape[1:] + b.shape[1:]

    k = a.shape[0]
    n = a.size // k
    m = b.size // k

    return _tensordot_core(a, b, None, n, m, k, ret_shape)
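A hedged usage sketch through the public cupy.inner API:

import cupy

u = cupy.array([1., 2., 3.])
v = cupy.array([4., 5., 6.])
print(cupy.inner(u, v))   # 32.0, the sum product over the last axis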
Example no. 32
    def train(self):
        # clear grads
        self.q_func.zerograds()

        # pull tuples from memory pool
        batch_tuples = self.replay.pull(Config.batch_size)
        if not len(batch_tuples):
            return

        # stack inputs
        cur_x = [self.env.getX(t.state) for t in batch_tuples]
        next_x = [self.env.getX(t.next_state) for t in batch_tuples]
        # merge inputs into one array
        if Config.gpu:
            cur_x = [cupy.expand_dims(t, 0) for t in cur_x]
            cur_x = cupy.concatenate(cur_x, 0)
            next_x = [cupy.expand_dims(t, 0) for t in next_x]
            next_x = cupy.concatenate(next_x, 0)
        else:
            cur_x = np.stack(cur_x)
            next_x = np.stack(next_x)

        # get cur outputs
        cur_output = self.QFunc(self.q_func, cur_x)
        # get next outputs, NOT target
        next_output = self.QFunc(self.q_func, next_x)
        # choose next action for each output
        next_action = [
            self.env.getBestAction(
                o.data,
                [t.next_state for t in batch_tuples]
            ) for o in next_output  # for each head in Model
        ]
        # get next outputs, target
        next_output = self.QFunc(self.target_q_func, next_x)

        # clear err of tuples
        for t in batch_tuples:
            t.err = 0.
        # store err count
        err_count_list = [0.] * len(batch_tuples)

        # compute grad's weights
        weights = np.array([t.P for t in batch_tuples], np.float32)
        if Config.gpu:
            weights = cuda.to_gpu(weights)
        if self.replay.getPoolSize():
            weights *= self.replay.getPoolSize()
        weights = weights ** -Config.beta
        weights /= weights.max()
        if Config.gpu:
            weights = cupy.expand_dims(weights, 1)
        else:
            weights = np.expand_dims(weights, 1)

        # update beta
        Config.beta = min(1, Config.beta + Config.beta_add)

        # compute grad for each head
        for k in range(Config.K):
            if Config.gpu:
                cur_output[k].grad = cupy.zeros_like(cur_output[k].data)
            else:
                cur_output[k].grad = np.zeros_like(cur_output[k].data)
            # compute grad from each tuples
            for i in range(len(batch_tuples)):
                if batch_tuples[i].mask[k]:
                    cur_action_value = \
                        cur_output[k].data[i][batch_tuples[i].action].tolist()
                    reward = batch_tuples[i].reward
                    next_action_value = \
                        next_output[k].data[i][next_action[k][i]].tolist()
                    target_value = reward
                    # if not empty position, not terminal state
                    if batch_tuples[i].next_state.in_game:
                        target_value += Config.gamma * next_action_value
                    loss = cur_action_value - target_value
                    cur_output[k].grad[i][batch_tuples[i].action] = 2 * loss
                    # count err
                    if cur_action_value:
                        batch_tuples[i].err += abs(loss / cur_action_value)
                        err_count_list[i] += 1

            # multiply weights with grad and clip
            if Config.gpu:
                cur_output[k].grad = cupy.multiply(
                    cur_output[k].grad, weights)
                cur_output[k].grad = cupy.clip(cur_output[k].grad, -1, 1)
            else:
                cur_output[k].grad = np.multiply(
                    cur_output[k].grad, weights)
                cur_output[k].grad = np.clip(cur_output[k].grad, -1, 1)
            # backward
            cur_output[k].backward()

        # adjust grads of shared
        for param in self.q_func.shared.params():
            param.grad /= Config.K

        # update params
        self.optimizer.update()

        # avg err
        for i in range(len(batch_tuples)):
            if err_count_list[i] > 0:
                batch_tuples[i].err /= err_count_list[i]

        self.replay.merge(Config.alpha)

        return np.mean([t.err for t in batch_tuples])
Example no. 33
def tensordot(a, b, axes=2):
    """Returns the tensor dot product of two arrays along specified axes.

    This is equivalent to compute dot product along the specified axes which
    are treated as one axis by reshaping.

    Args:
        a (cupy.ndarray): The first argument.
        b (cupy.ndarray): The second argument.
        axes:
            - If it is an integer, then ``axes`` axes at the last of ``a`` and
              the first of ``b`` are used.
            - If it is a pair of sequences of integers, then these two
              sequences specify the list of axes for ``a`` and ``b``. The
              corresponding axes are paired for sum-product.

    Returns:
        cupy.ndarray: The tensor dot product of ``a`` and ``b`` along the
        axes specified by ``axes``.

    .. seealso:: :func:`numpy.tensordot`

    """
    a_ndim = a.ndim
    b_ndim = b.ndim
    if a_ndim == 0 or b_ndim == 0:
        if axes != 0 and axes != ((), ()):
            raise ValueError('An input is zero-dim while axes has dimensions')
        return cupy.multiply(a, b)

    if isinstance(axes, collections.Sequence):
        if len(axes) != 2:
            raise ValueError('Axes must consist of two arrays.')
        a_axes, b_axes = axes
        if numpy.isscalar(a_axes):
            a_axes = a_axes,
        if numpy.isscalar(b_axes):
            b_axes = b_axes,
    else:
        a_axes = tuple(six.moves.range(a_ndim - axes, a_ndim))
        b_axes = tuple(six.moves.range(axes))

    sum_ndim = len(a_axes)
    if sum_ndim != len(b_axes):
        raise ValueError('Axes length mismatch')

    for a_axis, b_axis in zip(a_axes, b_axes):
        if a.shape[a_axis] != b.shape[b_axis]:
            raise ValueError('Axis dimension mismatch')

    # Make the axes non-negative
    a = _move_axes_to_head(a, [axis % a_ndim for axis in a_axes])
    b = _move_axes_to_head(b, [axis % b_ndim for axis in b_axes])

    ret_shape = a.shape[sum_ndim:] + b.shape[sum_ndim:]

    k = internal.prod(a.shape[:sum_ndim])
    n = a.size // k
    m = b.size // k

    return _tensordot_core(a, b, None, n, m, k, ret_shape)
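A hedged usage sketch through the public cupy.tensordot API (shapes are illustrative):

import cupy

a = cupy.arange(24, dtype=cupy.float32).reshape(2, 3, 4)
b = cupy.arange(12, dtype=cupy.float32).reshape(4, 3)
c = cupy.tensordot(a, b, axes=([1, 2], [1, 0]))   # contract the (3, 4) axes -> shape (2,)
print(c.shape)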
Example no. 34
def _tensordot_core(a, b, out, n, m, k, ret_shape):
    ret_dtype = a.dtype.char
    if ret_dtype != b.dtype.char:
        ret_dtype = numpy.find_common_type((ret_dtype, b.dtype), ()).char

    # Cast to float32 or float64
    if ret_dtype == 'f' or ret_dtype == 'd':
        dtype = ret_dtype
    else:
        dtype = numpy.find_common_type((ret_dtype, 'f'), ()).char

    a = a.astype(dtype, copy=False)
    b = b.astype(dtype, copy=False)

    if not a.size or not b.size:
        if a.size or b.size:
            raise ValueError('cannot dot zero-sized and non-zero-sized arrays')
        if out is None:
            return cupy.zeros(ret_shape, dtype=ret_dtype)
        else:
            out.fill(0)
            return out

    if out is None:
        out = cupy.empty(ret_shape, dtype)
        if dtype == ret_dtype:
            ret = out
        else:
            ret = cupy.empty(ret_shape, ret_dtype)
    else:
        ret = out
        if out.dtype != dtype:
            out = cupy.empty(ret_shape, dtype)

    # It copies the operands if needed
    if a.shape != (k, n):
        a = cupy.reshape(a, (k, n))
    if b.shape != (k, m):
        b = cupy.reshape(b, (k, m))
    c = out
    if c.shape != (n, m):
        c = c.view()
        c.shape = (n, m)

    # Be careful that cuBLAS uses the FORTRAN-order matrix representation.
    if k == 1:
        if n == 1:
            # Scalar-vector product
            cupy.multiply(a, b, c)
        elif m == 1:
            # Scalar-vector product
            cupy.multiply(a.T, b, c)
        else:
            # Outer product A^T * B
            # c is C-contiguous while cuBLAS requires F-contiguous arrays, so
            # we compute C^T = B^T * A here.
            handle = cuda.Device().cublas_handle
            c.fill(0)
            a, inca = _to_cublas_vector(a, 1)
            b, incb = _to_cublas_vector(b, 1)
            if dtype == 'f':
                ger = cublas.sger
            elif dtype == 'd':
                ger = cublas.dger
            ger(handle, m, n, 1, b.data.ptr, incb, a.data.ptr, inca,
                c.data.ptr, m)

        if dtype != ret_dtype:
            elementwise.copy(out, ret)
        return ret

    handle = cuda.Device().cublas_handle
    if n == 1:
        if m == 1:
            # Inner product
            a, inca = _to_cublas_vector(a, 0)
            b, incb = _to_cublas_vector(b, 0)
            mode = cublas.getPointerMode(handle)
            cublas.setPointerMode(handle,
                                  cublas.CUBLAS_POINTER_MODE_DEVICE)
            if dtype == 'f':
                dot = cublas.sdot
            elif dtype == 'd':
                dot = cublas.ddot
            try:
                dot(handle, k, a.data.ptr, inca, b.data.ptr, incb, c.data.ptr)
            finally:
                cublas.setPointerMode(handle, mode)
        else:
            # Matrix-vector product B^T * A
            a, inca = _to_cublas_vector(a, 0)
            b, transb, ldb = _mat_to_cublas_contiguous(b, 1)
            if transb:
                # gemv requires (m, k) as the original matrix dimensions
                # rather than the transposed dimensions.
                m, k = k, m
            if dtype == 'f':
                gemv = cublas.sgemv
            elif dtype == 'd':
                gemv = cublas.dgemv
            gemv(handle, transb, m, k, 1, b.data.ptr, ldb, a.data.ptr, inca,
                 0, c.data.ptr, 1)
    elif m == 1:
        # Matrix-vector product A^T * B
        a, transa, lda = _mat_to_cublas_contiguous(a, 1)
        b, incb = _to_cublas_vector(b, 0)
        if transa:
            # gemv requires (n, k) as the original matrix dimensions rather
            # than the transposed dimensions.
            n, k = k, n
        if dtype == 'f':
            gemv = cublas.sgemv
        elif dtype == 'd':
            gemv = cublas.dgemv
        gemv(handle, transa, n, k, 1, a.data.ptr, lda, b.data.ptr, incb, 0,
             c.data.ptr, 1)
    else:
        # Matrix-Matrix product A^T * B
        # c is C-contiguous while cuBLAS assumes F-contiguous inputs, so we
        # compute C^T = B^T * A here.
        a, transa, lda = _mat_to_cublas_contiguous(a, 0)
        b, transb, ldb = _mat_to_cublas_contiguous(b, 1)
        if dtype == 'f':
            gemm = cublas.sgemm
        elif dtype == 'd':
            gemm = cublas.dgemm
        gemm(handle, transb, transa, m, n, k, 1, b.data.ptr, ldb, a.data.ptr,
             lda, 0, c.data.ptr, m)

    if dtype != ret_dtype:
        elementwise.copy(out, ret)
    return ret
Example no. 35
def tensordot(a, b, axes=2, out=None):
    """Returns the tensor dot product of two arrays along specified axes.

    This is equivalent to compute dot product along the specified axes which
    are treated as one axis by reshaping.

    Args:
        a (cupy.ndarray): The first argument.
        b (cupy.ndarray): The second argument.
        axes:
            - If it is an integer, then ``axes`` axes at the last of ``a`` and
              the first of ``b`` are used.
            - If it is a pair of sequences of integers, then these two
              sequences specify the list of axes for ``a`` and ``b``. The
              corresponding axes are paired for sum-product.
        out (cupy.ndarray): Output array.

    Returns:
        cupy.ndarray: The tensor dot product of ``a`` and ``b`` along the
        axes specified by ``axes``.

    .. seealso:: :func:`numpy.tensordot`

    """
    if a.ndim == 0 or b.ndim == 0:
        if axes != 0 and axes != ((), ()):
            raise ValueError('An input is zero-dim while axes has dimensions')
        return cupy.multiply(a, b, out=out)

    ret_dtype = numpy.find_common_type([a.dtype, b.dtype], [])

    # Cast to float32 or float64
    dtype = numpy.find_common_type([a.dtype, b.dtype, 'f'], [])
    a = a.astype(dtype, copy=False)
    b = b.astype(dtype, copy=False)

    if a.dtype.type == numpy.float32:
        dot = cublas.sdot
        gemv = cublas.sgemv
        ger = cublas.sger
        gemm = cublas.sgemm
    elif a.dtype.type == numpy.float64:
        dot = cublas.ddot
        gemv = cublas.dgemv
        ger = cublas.dger
        gemm = cublas.dgemm

    if numpy.isscalar(axes):
        axes = [list(six.moves.range(a.ndim - axes, a.ndim)),
                list(six.moves.range(axes))]
    else:
        axes = list(axes)
    if numpy.isscalar(axes[0]):
        axes[0] = (axes[0],)
    if numpy.isscalar(axes[1]):
        axes[1] = (axes[1],)

    if len(axes) != 2:
        raise ValueError('Axes must consist of two arrays.')
    if len(axes[0]) != len(axes[1]):
        raise ValueError('Axes length mismatch')
    for a_axis, b_axis in zip(*axes):
        if not (-a.ndim <= a_axis < a.ndim and
                -b.ndim <= b_axis < b.ndim):
            raise IndexError('Axis overrun')
        if a.shape[a_axis] != b.shape[b_axis]:
            raise ValueError('Axis dimension mismatch')

    # Make the axes non-negative
    axes = (tuple(axis % a.ndim for axis in axes[0]),
            tuple(axis % b.ndim for axis in axes[1]))

    sum_ndim = len(axes[0])
    a = _move_axes_to_head(a, axes[0])
    b = _move_axes_to_head(b, axes[1])

    m = internal.prod(b.shape[sum_ndim:])
    n = internal.prod(a.shape[sum_ndim:])
    ret_shape = a.shape[sum_ndim:] + b.shape[sum_ndim:]

    if out is not None:
        if out.size != internal.prod(ret_shape):
            raise ValueError('Output array has an invalid size')
        if not out.flags.c_contiguous:
            raise ValueError('Output array must be C-contiguous')

    if 0 in a.shape or 0 in b.shape:
        if 0 not in a.shape or 0 not in b.shape:
            raise ValueError('cannot dot zero-sized and non-zero-sized arrays')
        if out is None:
            return cupy.zeros(ret_shape, dtype=ret_dtype)
        else:
            out.fill(0)
            return out

    if out is None:
        out = cupy.empty(ret_shape, dtype=dtype)
        if dtype == ret_dtype:
            ret = out
        else:
            ret = cupy.empty(ret_shape, dtype=ret_dtype)
    else:
        ret = out
        if out.dtype != dtype:
            out = cupy.empty(ret_shape, dtype=dtype)

    k = a.size // n

    # It copies the operands if needed
    a = a.reshape(k, n)
    b = b.reshape(k, m)
    c = out.view()
    c.shape = (n, m)

    # Be careful that cuBLAS uses the FORTRAN-order matrix representation.
    handle = cuda.Device().cublas_handle
    if k == 1:
        if n == 1 or m == 1:
            # Scalar-vector product
            cupy.multiply(a.T, b, c)
        else:
            # Outer product A^T * B
            # c is C-contiguous while cuBLAS requires F-contiguous arrays, so
            # we compute C^T = B^T * A here.
            c.fill(0)
            a, inca = _to_cublas_vector(a, 1)
            b, incb = _to_cublas_vector(b, 1)
            ger(handle, m, n, 1, b._fptr, incb, a._fptr, inca, c._fptr, m)
    elif n == 1:
        if m == 1:
            # Inner product
            a, inca = _to_cublas_vector(a, 0)
            b, incb = _to_cublas_vector(b, 0)
            mode = cublas.getPointerMode(handle)
            cublas.setPointerMode(handle,
                                  cublas.CUBLAS_POINTER_MODE_DEVICE)
            try:
                dot(handle, k, a._fptr, inca, b._fptr, incb, c._fptr)
            finally:
                cublas.setPointerMode(handle, mode)
        else:
            # Matrix-vector product B^T * A
            a, inca = _to_cublas_vector(a, 1)
            b, transb, ldb = _mat_to_cublas_contiguous(b.T)
            if transb:
                # gemv requires (m, k) as the original matrix dimensions
                # rather than the transposed dimensions.
                m, k = k, m
            gemv(handle, transb, m, k, 1, b._fptr, ldb, a._fptr, inca,
                 0, c._fptr, 1)
    elif m == 1:
        # Matrix-vector product A^T * B
        a, transa, lda = _mat_to_cublas_contiguous(a.T)
        b, incb = _to_cublas_vector(b, 1)
        if not transa:
            # gemv requires (n, k) as the original matrix dimensions rather
            # than the transposed dimensions.
            n, k = k, n
        gemv(handle, transa, n, k, 1, a._fptr, lda, b._fptr, incb, 0, c._fptr,
             1)
    else:
        # Matrix-Matrix product A^T * B
        # c is C-contiguous while cuBLAS assumes F-contiguous inputs, so we
        # compute C^T = B^T * A here.
        a, transa, lda = _mat_to_cublas_contiguous(a)
        b, transb, ldb = _mat_to_cublas_contiguous(b.T)
        gemm(handle, transb, transa, m, n, k, 1, b._fptr, ldb, a._fptr,
             lda, 0, c._fptr, m)

    if dtype != ret_dtype:
        elementwise.copy(out, ret)
    return ret