a_mic = gd.empty(nbands, usemic=True)
b_mic = gd.empty(nbands, usemic=True)
c_mic = stream.bind(c)
np.random.seed(10)
a_mic.array[:] = np.random.random(a_mic.shape)
b_mic.array[:] = np.random.random(b_mic.shape)
# a_mic.update_device()
# b_mic.update_device()

# warm-up
for i in range(3):
    a_mic.update_device()
    b_mic.update_device()
    gd.integrate(a_mic, b_mic, hermitian=False, _transposed_result=c)
    c_mic.update_device()
    mic_gemm(1.0, a_mic, c_mic, 0.0, b_mic)
    b_mic.update_host()
t0 = time()
# equal(np.sum(c), 3600.89536641, 1e-6)
for i in range(repeats):
    a_mic.update_device()
    b_mic.update_device()
    gd.integrate(a_mic, b_mic, hermitian=False, _transposed_result=c)
    c_mic.update_device()
    mic_gemm(1.0, a_mic, c_mic, 0.0, b_mic)
    b_mic.update_host()
t1 = time()
if rank == 0:
    print "Check", np.sum(b_mic.array), "Time", (t1 - t0) / repeats
    def matrix_multiply(self, C_NN, psit_nG, P_ani=None, out_nG=None):
        """Calculate new linear combinations of wave functions.

        Results will be put in the *P_ani* dict and a new psit_nG returned::

                     __                                __
            ~       \       ~           ~a  ~         \       ~a  ~
           psi  <--  ) C   psi    and  <p |psi >  <--  ) C   <p |psi >
              n     /__ nn'   n'         i    n       /__ nn'  i    n'
                     n'                                n'


        Parameters:

        C_NN: ndarray
            Matrix representation of the requested linear combinations. Even
            with a hermitian operator, this matrix need not be self-adjoint.
            However, unlike the results from calculate_matrix_elements, it is
            assumed that all matrix elements are filled in (use e.g. tri2full).
        psit_nG: ndarray
            Set of vectors from which the linear combinations are formed.
        P_ani: dict
            Dictionary of projector overlap integrals P_ni = <p_i | psit_nG>.

        """

        if self.A_nn is None:
            self.allocate_arrays()

        band_comm = self.bd.comm
        B = band_comm.size
        J = self.nblocks
        N = self.bd.mynbands

        C_NN = self.bmd.redistribute_input(C_NN)

        if B == 1 and J == 1:
            # Simple case:
            if use_mic:
                work_nG = self.work1_xG_mic
            else:
                work_nG = reshape(self.work1_xG, psit_nG.shape)
            if out_nG is None:
                out_nG = work_nG
                # out_nG[:] = 117  # gemm may not like nan's
            elif out_nG is psit_nG:
                work_nG[:] = psit_nG
                psit_nG = work_nG

            if use_mic:
                if self.gd.comm.rank == 0:
                    offload_report(1)
                C_NN_mic = self.A_nn_mic
                C_NN_mic.array[:] = C_NN[:]
                C_NN_mic.update_device()
                stream.sync()
                mic_gemm(1.0, psit_nG, C_NN_mic, 0.0, out_nG)
                if self.gd.comm.rank == 0:
                    offload_report(0)
            else:
                self.gd.gemm(1.0, psit_nG, C_NN, 0.0, out_nG)
            if P_ani:
                for P_ni in P_ani.values():
                    gemm(1.0, P_ni.copy(), C_NN, 0.0, P_ni)
            return out_nG

        # Now it gets nasty! We parallelize over B groups of bands and
        # each grid chunk is divided in J smaller slices (less memory).

        Q = B  # always non-hermitian XXX
        rank = band_comm.rank
        shape = psit_nG.shape
        psit_nG = psit_nG.reshape(N, -1)
        G = psit_nG.shape[1]  # number of grid-points
        g = int(np.ceil(G / float(J)))

        # Buffers for send/receive of pre-multiplication versions of P_ani's.
        sbuf_nI = rbuf_nI = None
        if P_ani:
            sbuf_nI = np.hstack([P_ni for P_ni in P_ani.values()])
            sbuf_nI = np.ascontiguousarray(sbuf_nI)
            if B > 1:
                rbuf_nI = np.empty_like(sbuf_nI)

        # Because of the amount of communication involved, we need to
        # be synchronized up to this point, but only on the 1D band_comm
        # communication ring.
        band_comm.barrier()
        while g * J >= G + g:  # remove extra slice(s)
            J -= 1
        assert 0 < g * J < G + g

        work1_xG = reshape(self.work1_xG, (self.X, ) + psit_nG.shape[1:])
        work2_xG = reshape(self.work2_xG, (self.X, ) + psit_nG.shape[1:])

        for j in range(J):
            G1 = j * g
            G2 = G1 + g
            if G2 > G:
                G2 = G
                g = G2 - G1
            sbuf_ng = reshape(work1_xG, (N, g))
            rbuf_ng = reshape(work2_xG, (N, g))
            sbuf_ng[:] = psit_nG[:, G1:G2]
            beta = 0.0
            cycle_P_ani = (j == J - 1 and P_ani)
            for q in range(Q):
                # Start sending currently buffered kets to rank below
                # and receiving next set of kets from rank above us.
                # If we're at the last slice, start cycling P_ani too.
                if q < Q - 1:
                    self._initialize_cycle(sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI,
                                           cycle_P_ani)

                # Calculate wave-function contributions from the current slice
                # of grid data by the current mynbands x mynbands matrix block.
                C_nn = self.bmd.extract_block(C_NN, (rank + q) % B, rank)
                self.gd.gemm(1.0, sbuf_ng, C_nn, beta, psit_nG[:, G1:G2])

                # If we're at the last slice, add contributions to P_ani's.
                if cycle_P_ani:
                    I1 = 0
                    for P_ni in P_ani.values():
                        I2 = I1 + P_ni.shape[1]
                        gemm(1.0, sbuf_nI[:, I1:I2], C_nn, beta, P_ni)
                        I1 = I2

                # Wait for all send/receives to finish before next iteration.
                # Swap send and receive buffer such that next becomes current.
                # If we're at the last slice, also finishes the P_ani cycle.
                if q < Q - 1:
                    sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI = self._finish_cycle(
                        sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI, cycle_P_ani)

                # First iteration was special because we initialized the kets
                if q == 0:
                    beta = 1.0

        psit_nG.shape = shape
        return psit_nG
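# Hedged NumPy reference for the B == 1, J == 1 branch of matrix_multiply
# above (a sketch, not GPAW API): with all bands on one rank the method
# reduces to a dense product over the band index n',
#     psit_new[n, g] = sum_n' C[n, n'] psit[n', g]
#     P_new[n, i]    = sum_n' C[n, n'] P[n', i]
import numpy as np


def matrix_multiply_reference(C_NN, psit_nG, P_ani=None):
    """Plain-NumPy sketch of the linear combinations formed above."""
    N = len(psit_nG)
    out_nG = np.dot(C_NN, psit_nG.reshape(N, -1)).reshape(psit_nG.shape)
    if P_ani is not None:
        P_ani = dict((a, np.dot(C_NN, P_ni)) for a, P_ni in P_ani.items())
    return out_nG, P_ani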
    def integrate(self, a_xg, b_yg=None,
                  global_integral=True, hermitian=False,
                  _transposed_result=None):
        """Integrate function(s) over domain.

        a_xg: ndarray
            Function(s) to be integrated.
        b_yg: ndarray
            If present, integrate a_xg.conj() * b_yg.
        global_integral: bool
            If the array(s) are distributed over several domains, then the
            total sum will be returned.  To get the local contribution
            only, use global_integral=False.
        hermitian: bool
            Result is hermitian.
        _transposed_result: ndarray
            Long story.  Don't use this unless you are a method of the
            MatrixOperator class ..."""
        
        xshape = a_xg.shape[:-3]
        
        if b_yg is None:
            # Only one array:
            result = a_xg.reshape(xshape + (-1,)).sum(axis=-1) * self.dv
            if global_integral:
                if result.ndim == 0:
                    result = self.comm.sum(result)
                else:
                    self.comm.sum(result)
            return result

        if isinstance(a_xg, mic.OffloadArray):
            # offload arrays have to be contiguous in any case
            A_xg = a_xg
            B_yg = b_yg
        else:
            A_xg = np.ascontiguousarray(a_xg.reshape((-1,) + a_xg.shape[-3:]))
            B_yg = np.ascontiguousarray(b_yg.reshape((-1,) + b_yg.shape[-3:]))

        if _transposed_result is None:
            result_yx = np.zeros((len(B_yg), len(A_xg)), A_xg.dtype)
        else:
            result_yx = _transposed_result
            global_integral = False

        if isinstance(a_xg, mic.OffloadArray):
            result_yx_mic = stream.bind(result_yx)
            stream.sync()
            # result_yx_mic.fillfrom(result_yx)
            # result_yx_mic.array[:] = result_yx[:]
            # result_yx_mic.update_device()

        if a_xg is b_yg:
            if isinstance(a_xg, mic.OffloadArray):
                # dsyrk performs badly on the MIC, so use dgemm here
                # mic_rk(self.dv, A_xg, 0.0, result_yx_mic)
                mic_gemm(self.dv, A_xg, A_xg, 0.0, result_yx_mic, 'c')
            else:
                rk(self.dv, A_xg, 0.0, result_yx)
        elif hermitian:
            if isinstance(a_xg, mic.OffloadArray):
                mic_r2k(self.dv, A_xg, B_yg, 0.0, result_yx_mic)
            else:
                r2k(0.5 * self.dv, A_xg, B_yg, 0.0, result_yx)
        else:
            if isinstance(a_xg, mic.OffloadArray):
                mic_gemm(self.dv, A_xg, B_yg, 0.0, result_yx_mic, 'c')
            else:
                gemm(self.dv, A_xg, B_yg, 0.0, result_yx, 'c')
        
        if isinstance(a_xg, mic.OffloadArray):
            result_yx_mic.update_host()
            stream.sync()

        if global_integral:
            self.comm.sum(result_yx)

        yshape = b_yg.shape[:-3]
        result = result_yx.T.reshape(xshape + yshape)
        
        if result.ndim == 0:
            return result.item()
        else:
            return result
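# Hedged NumPy reference for integrate(a_xg, b_yg) above (a sketch, assuming
# a single leading band axis and a serial domain, self.comm.size == 1, so no
# comm.sum is needed): the pairwise integrals are a dot product over the grid
# points scaled by the volume element dv,
#     result[x, y] = dv * sum_g conj(a[x, g]) * b[y, g]
# The rk/r2k branches above are specializations of this product for the
# a_xg is b_yg and hermitian cases.
import numpy as np


def integrate_reference(a_xg, b_yg, dv):
    """Plain-NumPy sketch of the pairwise grid integrals formed above."""
    A = a_xg.reshape(len(a_xg), -1)
    B = b_yg.reshape(len(b_yg), -1)
    return dv * np.dot(A.conj(), B.T)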
Example #5
# a_mic.array = a_mic.array.reshape(a_mic.shape[:-3] + (-1,))
a_mic.update_host()
a_mic_sum = np.sum(a_mic.array)
print "   sum(a_mic)=" + str(a_mic_sum)

b_mic = mic.offload_array(b.shape, dtype=float)
b_mic.fillfrom(b)
# b_mic.update_device()
# b_mic.array = b_mic.array.reshape(b_mic.shape[:-3] + (-1,))
b_mic.update_host()
b_mic_sum = np.sum(b_mic.array)
print "   sum(b_mic)=" + str(b_mic_sum)

# c_mic = offload_array(c.shape, dtype=float)
# c_mic.fill(0.0);
c_mic = device.associate(c)
c_mic.update_device()

t0 = time()
for r in range(repeats):
    mic_gemm(alpha, a_mic, b_mic, beta, c_mic, "c")
    #mic_r2k(alpha, a_mic, b_mic, beta, c_mic)
    c_mic.update_host()
    c[:] = c_mic.array[:]
t1 = time()
print "MIC time", t1 - t0
print "MIC checks"
c_mic_sum = np.sum(c_mic.array)
print "    sum(c_mic)=" + str(c_mic_sum)
print "    sum(c)=" + str(np.sum(c))