def getInstructions(app_name='testEfficiency_default',
                    functions=('gauss_solve', 'buildR_kernel', 'amplitude_capon'),
                    M=16, L=8):
    """Return the issued-instruction count nvprof reports for each kernel.

    Runs ``./<app_name> testEfficiency testMVDRKernelPerformance M L`` under
    ``nvprof`` with the four ``inst_issued*`` event counters enabled and
    parses the textual report.

    For every report line that matches one of ``functions``, the following
    four lines are read (one per event, in the order the events were
    requested).  On each of them the leading whitespace and everything from
    the event name onward are stripped, and the second whitespace-separated
    field is taken as the counter value.

    Parameters
    ----------
    app_name : str
        Name of the executable; it is run as ``./<app_name>``.
    functions : sequence of str
        Kernel names.  Each one is interpolated unescaped into a regular
        expression and searched for in every report line.
    M, L : int
        Passed to the executable as its last two command-line arguments.

    Returns
    -------
    numpy.ndarray
        One value per entry of ``functions``, computed as
        ``inst_issued1_0 + 2*inst_issued2_0 + inst_issued1_1 +
        2*inst_issued2_1`` (formula quoted from the CUPTI user's manual by
        the original author).  Kernels that never appear in the report keep
        the value 0.  When the module-level flag ``ONLY_TIME`` is true
        nothing is executed and an all-zero array is returned.

    Raises
    ------
    Exception
        If the profiler run wrote anything to stderr; the message is the
        stderr text.
    IndexError
        If the report ends before all event lines of a matched kernel.
    """
    if ONLY_TIME:
        # One slot per requested kernel (the default list has three entries).
        return np.zeros((len(functions),))

    events = "inst_issued1_0,inst_issued2_0,inst_issued1_1,inst_issued2_1"
    events_list = events.split(',')

    command = ("nvprof -u ns --devices 0 --events %s ./%s "
               "testEfficiency testMVDRKernelPerformance %d %d"
               % (events, app_name, M, L))
    # universal_newlines=True makes communicate() return text on both
    # Python 2 and Python 3, so the comparison with '' below is meaningful.
    prof = Popen(command, shell=True, stdout=PIPE, stderr=PIPE,
                 universal_newlines=True)
    profiler_output, error = prof.communicate()
    if error != '':
        print(profiler_output)
        raise Exception(error)

    lines = re.split(r"\n+", profiler_output)
    counters = np.zeros((len(functions), len(events_list)))
    for i, line in enumerate(lines):
        for j, function in enumerate(functions):
            if not re.search(".+%s.+" % function, line):
                continue
            # The event rows of a kernel directly follow its header line.
            for k, event in enumerate(events_list):
                fields = re.sub(r"^\s*", '', lines[i + k + 1])
                fields = re.sub(r"\s*%s.*" % event, '', fields)
                columns = re.split(r"\s+", fields)
                counters[j, k] = float(columns[1])

    # Dual-issue events count two instructions each.
    return (counters[:, 0] + 2 * counters[:, 1]
            + counters[:, 2] + 2 * counters[:, 3])
def getMemoryOps(app_name='testEfficiency_default',
                 functions=('gauss_solve', 'buildR_kernel', 'amplitude_capon'),
                 M=16, L=4):
    """Return the memory-request counters nvprof reports for each kernel.

    Runs ``./<app_name> testEfficiency testMVDRKernelPerformance M L`` under
    ``nvprof`` with the events ``gld_request``, ``gst_request``,
    ``shared_load`` and ``shared_store`` enabled and parses the textual
    report.

    For every report line that matches one of ``functions``, the following
    four lines are read (one per event, in the order listed above).  On each
    of them the leading whitespace and everything from the event name onward
    are stripped, and the second whitespace-separated field is taken as the
    counter value.

    Parameters
    ----------
    app_name : str
        Name of the executable; it is run as ``./<app_name>``.
    functions : sequence of str
        Kernel names.  Each one is interpolated unescaped into a regular
        expression and searched for in every report line.
    M, L : int
        Passed to the executable as its last two command-line arguments.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(functions), 4)``; column order follows the
        event list above.  Kernels that never appear in the report keep
        zeros.  When the module-level flag ``ONLY_TIME`` is true nothing is
        executed and a 1-D all-zero array of length ``len(functions)`` is
        returned instead (note the different rank).

    Raises
    ------
    Exception
        If the profiler run wrote anything to stderr; the message is the
        stderr text.
    IndexError
        If the report ends before all event lines of a matched kernel.
    """
    if ONLY_TIME:
        # One slot per requested kernel (the default list has three entries).
        return np.zeros((len(functions),))

    events = "gld_request,gst_request,shared_load,shared_store"
    events_list = events.split(',')

    command = ("nvprof -u ns --devices 0 --events %s ./%s "
               "testEfficiency testMVDRKernelPerformance %d %d"
               % (events, app_name, M, L))
    # universal_newlines=True makes communicate() return text on both
    # Python 2 and Python 3, so the comparison with '' below is meaningful.
    prof = Popen(command, shell=True, stdout=PIPE, stderr=PIPE,
                 universal_newlines=True)
    profiler_output, error = prof.communicate()
    if error != '':
        print(profiler_output)
        raise Exception(error)

    lines = re.split(r"\n+", profiler_output)
    memops = np.zeros((len(functions), len(events_list)))
    for i, line in enumerate(lines):
        for j, function in enumerate(functions):
            if not re.search(".+%s.+" % function, line):
                continue
            # The event rows of a kernel directly follow its header line.
            for k, event in enumerate(events_list):
                fields = re.sub(r"^\s*", '', lines[i + k + 1])
                fields = re.sub(r"\s*%s.*" % event, '', fields)
                columns = re.split(r"\s+", fields)
                memops[j, k] = float(columns[1])
    return memops
def setUp(self):
    """Create a random complex 5x5 matrix ``self.R`` and 5-vector ``self.a``.

    Every real and imaginary part is an independent draw from
    ``np.random.normal(2, 3)``.  Draws are consumed column by column:
    first ``a[col]``, then ``R[0, col] .. R[n-1, col]``.
    """
    size = 5
    mean, spread = 2, 3

    def draw():
        # The real part is drawn before the imaginary part.
        real_part = np.random.normal(mean, spread)
        imag_part = np.random.normal(mean, spread)
        return complex(real_part, imag_part)

    self.n = size
    self.R = np.zeros((size, size), dtype=complex)
    self.a = np.zeros((size,), dtype=complex)

    n_rows, n_cols = self.R.shape
    for col in range(n_cols):
        self.a[col] = draw()
        for row in range(n_rows):
            self.R[row, col] = draw()
def setUp(self):
    """Create a random complex 5x5 matrix ``self.R`` and 5-vector ``self.a``.

    Every real and imaginary part is an independent draw from
    ``np.random.normal(2, 3)``.  Draws are consumed column by column:
    first ``a[col]``, then ``R[0, col] .. R[n-1, col]``.
    """
    size = 5
    mean, spread = 2, 3

    def draw():
        # The real part is drawn before the imaginary part.
        real_part = np.random.normal(mean, spread)
        imag_part = np.random.normal(mean, spread)
        return complex(real_part, imag_part)

    self.n = size
    self.R = np.zeros((size, size), dtype=complex)
    self.a = np.zeros((size,), dtype=complex)

    n_rows, n_cols = self.R.shape
    for col in range(n_cols):
        self.a[col] = draw()
        for row in range(n_rows):
            self.R[row, col] = draw()
def getTimings(app_name='testEfficiency_default',
               functions=('gauss_solve', 'buildR_kernel', 'amplitude_capon'),
               M=8, L=4):
    """Return one timing value per kernel, parsed from an nvprof report.

    Runs ``./<app_name> testEfficiency testMVDRKernelPerformance M L`` under
    ``nvprof -u ns`` and scans the report.  For the first entry of
    ``functions`` that matches a report line, the leading whitespace and
    everything from the (optionally letter/space-prefixed) kernel name
    onward are stripped; the remaining whitespace-separated fields are
    converted to floats and stored as that kernel's row.

    Parameters
    ----------
    app_name : str
        Name of the executable; it is run as ``./<app_name>``.
    functions : sequence of str
        Kernel names.  Each one is interpolated unescaped into a regular
        expression and searched for in every report line.
    M, L : int
        Passed to the executable as its last two command-line arguments.

    Returns
    -------
    numpy.ndarray
        Field index 3 of each kernel's row divided by 1e3, one value per
        entry of ``functions`` (presumably the per-call average converted
        from ns to us -- confirm against the nvprof column layout in use).
        When the module-level flag ``USE_CODE_COUNTERS`` is true the
        executable is run without the profiler, its output is discarded and
        an all-zero array is returned.

    Raises
    ------
    Exception
        If the profiler run wrote anything to stderr; the message is the
        stderr text.
    IndexError
        If no report line matches any entry of ``functions``.
    """
    if USE_CODE_COUNTERS:
        prof = Popen("./%s testEfficiency testMVDRKernelPerformance %d %d"
                     % (app_name, M, L),
                     shell=True, stdout=PIPE, stderr=PIPE)
        # Wait for the run to finish; its output is not used here.
        prof.communicate()
        return np.zeros((len(functions),))

    events = ''  # no extra nvprof options for a plain timing run
    command = ("nvprof -u ns %s ./%s "
               "testEfficiency testMVDRKernelPerformance %d %d"
               % (events, app_name, M, L))
    # universal_newlines=True makes communicate() return text on both
    # Python 2 and Python 3, so the comparison with '' below is meaningful.
    prof = Popen(command, shell=True, stdout=PIPE, stderr=PIPE,
                 universal_newlines=True)
    profiler_output, error = prof.communicate()
    if error != '':
        print(profiler_output)
        raise Exception(error)

    runtimes = None  # allocated on the first match, once the width is known
    for line in re.split(r"\n+", profiler_output):
        for i, function in enumerate(functions):
            if not re.search(".+%s.+" % function, line):
                continue
            fields = re.sub(r"^\s*", '', line)
            fields = re.sub(r"[\sa-zA-Z]*%s.+" % function, '', fields)
            columns = re.split(r"\s+", fields)
            if runtimes is None:
                runtimes = np.zeros((len(functions), len(columns)))
            # A list (not a lazy map object) so the row assignment also
            # works on Python 3.
            runtimes[i] = [float(column) for column in columns]
            break  # a report line belongs to at most one kernel

    if runtimes is None:
        # Same exception type the original 0-d array indexing produced.
        raise IndexError("no profiler output line matched any of %r"
                         % (list(functions),))
    return runtimes[:, 3] / 1e3
def getMemoryOps(app_name='testEfficiency_default',
                 functions=('gauss_solve', 'buildR_kernel', 'amplitude_capon'),
                 M=16, L=4):
    """Return the memory-request counters nvprof reports for each kernel.

    Runs ``./<app_name> testEfficiency testMVDRKernelPerformance M L`` under
    ``nvprof`` with the events ``gld_request``, ``gst_request``,
    ``shared_load`` and ``shared_store`` enabled and parses the textual
    report.

    For every report line that matches one of ``functions``, the following
    four lines are read (one per event, in the order listed above).  On each
    of them the leading whitespace and everything from the event name onward
    are stripped, and the second whitespace-separated field is taken as the
    counter value.

    Parameters
    ----------
    app_name : str
        Name of the executable; it is run as ``./<app_name>``.
    functions : sequence of str
        Kernel names.  Each one is interpolated unescaped into a regular
        expression and searched for in every report line.
    M, L : int
        Passed to the executable as its last two command-line arguments.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(functions), 4)``; column order follows the
        event list above.  Kernels that never appear in the report keep
        zeros.  When the module-level flag ``ONLY_TIME`` is true nothing is
        executed and a 1-D all-zero array of length ``len(functions)`` is
        returned instead (note the different rank).

    Raises
    ------
    Exception
        If the profiler run wrote anything to stderr; the message is the
        stderr text.
    IndexError
        If the report ends before all event lines of a matched kernel.
    """
    if ONLY_TIME:
        # One slot per requested kernel (the default list has three entries).
        return np.zeros((len(functions),))

    events = "gld_request,gst_request,shared_load,shared_store"
    events_list = events.split(',')

    command = ("nvprof -u ns --devices 0 --events %s ./%s "
               "testEfficiency testMVDRKernelPerformance %d %d"
               % (events, app_name, M, L))
    # universal_newlines=True makes communicate() return text on both
    # Python 2 and Python 3, so the comparison with '' below is meaningful.
    prof = Popen(command, shell=True, stdout=PIPE, stderr=PIPE,
                 universal_newlines=True)
    profiler_output, error = prof.communicate()
    if error != '':
        print(profiler_output)
        raise Exception(error)

    lines = re.split(r"\n+", profiler_output)
    memops = np.zeros((len(functions), len(events_list)))
    for i, line in enumerate(lines):
        for j, function in enumerate(functions):
            if not re.search(".+%s.+" % function, line):
                continue
            # The event rows of a kernel directly follow its header line.
            for k, event in enumerate(events_list):
                fields = re.sub(r"^\s*", '', lines[i + k + 1])
                fields = re.sub(r"\s*%s.*" % event, '', fields)
                columns = re.split(r"\s+", fields)
                memops[j, k] = float(columns[1])
    return memops
def uhdu(A, n):
    """
    Square-root-free factorisation of the Hermitian n x n matrix A.

    Produces a unit upper triangular U and a diagonal D (stored as a
    length-n vector) such that U^H * D * U = A, where ^H is the conjugate
    transpose.  Keeping D separate avoids the complex square root a
    Cholesky factor would need.  Only the upper triangle of A is read.

    Returns the list [U, D].
    """
    U = eye(n, dtype=A.dtype)
    D = zeros(n, dtype=A.dtype)

    for row in range(n):
        # Diagonal entry: remove what the rows above already contribute.
        diag_acc = 0
        for above in range(row):
            diag_acc += U[above, row] * U[above, row].conjugate() * D[above]
        D[row] = A[row, row] - diag_acc

        # Remaining entries of this row of U, right of the diagonal.
        for col in range(row + 1, n):
            cross_acc = 0
            for above in range(row):
                cross_acc += U[above, row].conjugate() * U[above, col] * D[above]
            U[row, col] = (A[row, col] - cross_acc) / D[row]

    return [U, D]
def uhdu(A, n):
    """
    Square-root-free factorisation of the Hermitian n x n matrix A.

    Produces a unit upper triangular U and a diagonal D (stored as a
    length-n vector) such that U^H * D * U = A, where ^H is the conjugate
    transpose.  Keeping D separate avoids the complex square root a
    Cholesky factor would need.  Only the upper triangle of A is read.

    Returns the list [U, D].
    """
    U = eye(n, dtype=A.dtype)
    D = zeros(n, dtype=A.dtype)

    for row in range(n):
        # Diagonal entry: remove what the rows above already contribute.
        diag_acc = 0
        for above in range(row):
            diag_acc += U[above, row] * U[above, row].conjugate() * D[above]
        D[row] = A[row, row] - diag_acc

        # Remaining entries of this row of U, right of the diagonal.
        for col in range(row + 1, n):
            cross_acc = 0
            for above in range(row):
                cross_acc += U[above, row].conjugate() * U[above, col] * D[above]
            U[row, col] = (A[row, col] - cross_acc) / D[row]

    return [U, D]
def diagonalSolve(A, b, n):
    """Solve the diagonal system A x = b.

    A holds only the diagonal entries, i.e. it is a length-n vector rather
    than a full matrix.  The first n entries of b are divided element by
    element by the matching entries of A.

    Returns x as a new array of length n with A's dtype.
    """
    solution = np.zeros(n, A.dtype)
    for idx in range(n):
        pivot = A[idx]
        solution[idx] = b[idx] / pivot
    return solution
def testSolveBiCG(self):
    """Compare ls.solveBiCG with known and reference solutions.

    Three systems are solved, each from an all-zero start vector and with
    ``0, 0`` as the two trailing solver arguments:

    * the real fixture ``A x = b1``, expected ``x1``;
    * the complex fixture ``complexA x = complexb``, expected ``complexx``;
    * the random system ``randA x = randb``, expected
      ``np.linalg.solve(randA, randb)``.

    The last argument of ``assertMatrixAlmosteEqual`` (14, 13, 15) is
    presumably the number of decimals required -- see that helper.
    """
    real_solution = ls.solveBiCG(self.A, self.b1, self.x0_zero, 0, 0)
    self.assertMatrixAlmosteEqual(self.x1, real_solution, 14)

    complex_solution = ls.solveBiCG(
        self.complexA, self.complexb, self.x0_zero, 0, 0)
    self.assertMatrixAlmosteEqual(self.complexx, complex_solution, 13)

    # Reference answer from numpy's direct solver.
    reference = np.linalg.solve(self.randA, self.randb)
    zero_start = np.zeros(self.L, dtype=complex)
    random_solution = ls.solveBiCG(self.randA, self.randb, zero_start, 0, 0)
    self.assertMatrixAlmosteEqual(reference, random_solution, 15)
def uhduGPUProto(A, n):
    """
    CPU prototype of how the U^H D U factorisation should be done on the GPU.

    Factors the Hermitian n x n matrix A into a unit upper triangular U and
    a diagonal D (stored as an n x 1 column) such that U^H * D * U = A,
    where ^H is the conjugate transpose.  Keeping D separate avoids the
    complex square root a Cholesky factor would need.

    Each row of A is first staged in the small buffer Ai, standing in for
    the "shared" memory of the GPU version, and all reads of A for that row
    then go through the buffer.

    Returns the list [U, D].
    """
    U = eye([n, n], A.dtype)
    D = zeros([n, 1], A.dtype)
    Ai = zeros([n, 1], A.dtype)  # "shared" buffer holding row i of A

    for i in range(n):
        # read one row into "shared" memory
        for k in range(n):
            Ai[k, 0] = A[i, k]

        # Diagonal entry: remove what the rows above already contribute.
        upperColSum = 0
        for k in range(i):
            upperColSum += U[k, i] * U[k, i].conjugate() * D[k, 0]
        D[i, 0] = Ai[i, 0] - upperColSum

        # Remaining entries of row i of U, right of the diagonal.
        for j in range(i + 1, n):
            upperColSum = 0
            for k in range(i):
                upperColSum += U[k, i].conjugate() * U[k, j] * D[k, 0]
            U[i, j] = (Ai[j, 0] - upperColSum) / D[i, 0]

    return [U, D]
def uhduGPUProto(A, n):
    """
    CPU prototype of how the U^H D U factorisation should be done on the GPU.

    Factors the Hermitian n x n matrix A into a unit upper triangular U and
    a diagonal D (stored as an n x 1 column) such that U^H * D * U = A,
    where ^H is the conjugate transpose.  Keeping D separate avoids the
    complex square root a Cholesky factor would need.

    Each row of A is first staged in the small buffer Ai, standing in for
    the "shared" memory of the GPU version, and all reads of A for that row
    then go through the buffer.

    Returns the list [U, D].
    """
    U = eye([n, n], A.dtype)
    D = zeros([n, 1], A.dtype)
    Ai = zeros([n, 1], A.dtype)  # "shared" buffer holding row i of A

    for i in range(n):
        # read one row into "shared" memory
        for k in range(n):
            Ai[k, 0] = A[i, k]

        # Diagonal entry: remove what the rows above already contribute.
        upperColSum = 0
        for k in range(i):
            upperColSum += U[k, i] * U[k, i].conjugate() * D[k, 0]
        D[i, 0] = Ai[i, 0] - upperColSum

        # Remaining entries of row i of U, right of the diagonal.
        for j in range(i + 1, n):
            upperColSum = 0
            for k in range(i):
                upperColSum += U[k, i].conjugate() * U[k, j] * D[k, 0]
            U[i, j] = (Ai[j, 0] - upperColSum) / D[i, 0]

    return [U, D]
def testSolveBiCG(self):
    """Compare ls.solveBiCG with the known solutions of two 3x3 systems.

    The real fixture (``A``, ``b1`` -> ``x1``) and the complex fixture
    (``complexA``, ``complexb`` -> ``complexx``) are solved in that order.
    Both calls receive the same start-vector object, ``0`` as the fourth
    argument and the length of the right-hand side as the fifth.
    """
    # One zero vector, sized from b1 and shared by both solves.
    start = mynp.zeros(len(self.b1), dtype=complex)

    cases = (
        (self.A, self.b1, self.x1, 14),
        (self.complexA, self.complexb, self.complexx, 12),
    )
    for matrix, rhs, expected, places in cases:
        solved = ls.solveBiCG(matrix, rhs, start, 0, len(rhs))
        self.assertMatrixAlmosteEqual(expected, solved, places)
def forwardSolve(A, b, n):
    """Solve the lower triangular system A x = b by forward substitution.

    Only the lower triangle of A (diagonal included) and the first n
    entries of b are read.  Returns x as a new length-n array with A's
    dtype.
    """
    x = np.zeros(n, A.dtype)
    for row in range(n):
        x[row] = b[row]
        # Remove the contribution of the unknowns already solved.
        col = 0
        while col < row:
            x[row] -= A[row, col] * x[col]
            col += 1
        x[row] /= A[row, row]
    return x
def backtrackSolve(A, b, n):
    """Solve the upper triangular system A x = b by back substitution.

    Rows are processed from the last to the first; within a row the
    already-solved unknowns are subtracted from the highest column index
    downwards.  Only the upper triangle of A (diagonal included) and the
    first n entries of b are read.  Returns x as a new length-n array with
    A's dtype.
    """
    x = np.zeros(n, A.dtype)
    for row in range(n - 1, -1, -1):
        x[row] = b[row]
        for col in range(n - 1, row, -1):
            x[row] -= A[row, col] * x[col]
        x[row] /= A[row, row]
    return x
def backtrackSolve(A, b, n):
    """Solve the upper triangular system A x = b by back substitution.

    Rows are processed from the last to the first; within a row the
    already-solved unknowns are subtracted from the highest column index
    downwards.  Only the upper triangle of A (diagonal included) and the
    first n entries of b are read.  Returns x as a new length-n array with
    A's dtype.
    """
    x = np.zeros(n, A.dtype)
    for row in range(n - 1, -1, -1):
        x[row] = b[row]
        for col in range(n - 1, row, -1):
            x[row] -= A[row, col] * x[col]
        x[row] /= A[row, row]
    return x
def butlerMatrix(m, n, ix=0):
    """Return the m x n Butler matrix used for beamspace processing.

    Row ``r`` holds beam ``k`` of the normalized n-point DFT matrix:
    ``B[r, j] = exp(-1j*2*pi*k*j/n) / sqrt(n)``.

    Parameters
    ----------
    m : int
        Number of rows (beams) of the result.
    n : int
        Number of columns; also the DFT length.
    ix : iterable of int, optional
        Beam indices to use, in row order.  The default (``0``, or ``None``)
        selects the first m beams, ``0 .. m-1``.  Indices may exceed
        ``m - 1``: the beam index only enters the exponent, the row written
        is the position within ``ix``.  If ``ix`` has fewer than m entries
        the remaining rows stay zero.

    Returns
    -------
    B : complex array of shape (m, n)

    Raises
    ------
    IndexError
        If ``ix`` holds more than m entries.
    """
    # `ix != 0` is ambiguous for numpy arrays, so only compare scalars.
    if ix is None or (not hasattr(ix, '__len__') and ix == 0):
        beams = range(m)
    else:
        beams = ix

    B = mynp.zeros([m, n], dtype=complex)
    scale = 1 / ma.sqrt(n)
    for row, beam in enumerate(beams):
        for col in range(n):
            B[row, col] = scale * cm.exp(-1j * 2 * cm.pi * beam * col / n)
    return B
def butlerMatrix(m, n, ix=0):
    """Return the m x n Butler matrix used for beamspace processing.

    Row ``r`` holds beam ``k`` of the normalized n-point DFT matrix:
    ``B[r, j] = exp(-1j*2*pi*k*j/n) / sqrt(n)``.

    Parameters
    ----------
    m : int
        Number of rows (beams) of the result.
    n : int
        Number of columns; also the DFT length.
    ix : iterable of int, optional
        Beam indices to use, in row order.  The default (``0``, or ``None``)
        selects the first m beams, ``0 .. m-1``.  Indices may exceed
        ``m - 1``: the beam index only enters the exponent, the row written
        is the position within ``ix``.  If ``ix`` has fewer than m entries
        the remaining rows stay zero.

    Returns
    -------
    B : complex array of shape (m, n)

    Raises
    ------
    IndexError
        If ``ix`` holds more than m entries.
    """
    # `ix != 0` is ambiguous for numpy arrays, so only compare scalars.
    if ix is None or (not hasattr(ix, '__len__') and ix == 0):
        beams = range(m)
    else:
        beams = ix

    B = mynp.zeros([m, n], dtype=complex)
    scale = 1 / ma.sqrt(n)
    for row, beam in enumerate(beams):
        for col in range(n):
            B[row, col] = scale * cm.exp(-1j * 2 * cm.pi * beam * col / n)
    return B
def upinterpolate(img, Kx=2):
    """Upsample a 2-D image along its second axis by linear interpolation.

    Between every pair of horizontally neighbouring pixels, Kx new samples
    are placed at the fractional positions ``(i + 0.5) / Kx`` for
    ``i = 0 .. Kx-1``.  The original samples themselves are not part of the
    output.

    Parameters
    ----------
    img : 2-D array of shape (Ny, Nx)
    Kx : int, optional
        Upsampling factor along x (default 2, the value that used to be
        hard-coded).  NOTE(review): the original author's comment says Kx
        "must be even"; the arithmetic here works for any positive integer,
        so confirm whether callers rely on evenness.

    Returns
    -------
    new_img : array of shape (Ny, Kx * (Nx - 1)) with img's dtype
        For integer images the interpolated values are cast back to the
        integer dtype on assignment.

    Raises
    ------
    ValueError
        If Kx is smaller than 1.
    """
    if Kx < 1:
        raise ValueError("Kx must be a positive integer, got %r" % (Kx,))

    Ny, Nx = img.shape
    new_img = np.zeros((Ny, Kx * Nx - Kx), dtype=img.dtype)

    left = img[:, 0:-1]
    delta = img[:, 1:] - left  # step between horizontal neighbours
    for i in range(Kx):
        # Every Kx-th output column, starting at i, gets sub-sample i.
        new_img[:, i::Kx] = left + (i + 0.5) * delta / Kx
    return new_img
def upinterpolate(img, Kx=2):
    """Upsample a 2-D image along its second axis by linear interpolation.

    Between every pair of horizontally neighbouring pixels, Kx new samples
    are placed at the fractional positions ``(i + 0.5) / Kx`` for
    ``i = 0 .. Kx-1``.  The original samples themselves are not part of the
    output.

    Parameters
    ----------
    img : 2-D array of shape (Ny, Nx)
    Kx : int, optional
        Upsampling factor along x (default 2, the value that used to be
        hard-coded).  NOTE(review): the original author's comment says Kx
        "must be even"; the arithmetic here works for any positive integer,
        so confirm whether callers rely on evenness.

    Returns
    -------
    new_img : array of shape (Ny, Kx * (Nx - 1)) with img's dtype
        For integer images the interpolated values are cast back to the
        integer dtype on assignment.

    Raises
    ------
    ValueError
        If Kx is smaller than 1.
    """
    if Kx < 1:
        raise ValueError("Kx must be a positive integer, got %r" % (Kx,))

    Ny, Nx = img.shape
    new_img = np.zeros((Ny, Kx * Nx - Kx), dtype=img.dtype)

    left = img[:, 0:-1]
    delta = img[:, 1:] - left  # step between horizontal neighbours
    for i in range(Kx):
        # Every Kx-th output column, starting at i, gets sub-sample i.
        new_img[:, i::Kx] = left + (i + 0.5) * delta / Kx
    return new_img
def cholesky(A, n):
    """
    Cholesky factorisation of the Hermitian n x n matrix A.

    Returns the upper triangular U (same dtype as A) with U^H * U = A,
    where ^H is the conjugate transpose.  Only the upper triangle of A is
    read.  The diagonal is obtained with ``** 0.5``.
    """
    U = zeros([n, n], A.dtype)

    for row in range(n):
        # Diagonal entry: remove the squared magnitudes stacked above it.
        diag_acc = 0
        for above in range(row):
            diag_acc += U[above, row] * (U[above, row]).conjugate()
        U[row, row] = (A[row, row] - diag_acc) ** 0.5

        # Entries to the right of the diagonal in this row.
        for col in range(row + 1, n):
            cross_acc = 0
            for above in range(row):
                cross_acc += U[above, row].conjugate() * U[above, col]
            U[row, col] = (A[row, col] - cross_acc) / U[row, row]

    return U
def cholesky(A, n):
    """
    Cholesky factorisation of the Hermitian n x n matrix A.

    Returns the upper triangular U (same dtype as A) with U^H * U = A,
    where ^H is the conjugate transpose.  Only the upper triangle of A is
    read.  The diagonal is obtained with ``** 0.5``.
    """
    U = zeros([n, n], A.dtype)

    for row in range(n):
        # Diagonal entry: remove the squared magnitudes stacked above it.
        diag_acc = 0
        for above in range(row):
            diag_acc += U[above, row] * (U[above, row]).conjugate()
        U[row, row] = (A[row, row] - diag_acc) ** 0.5

        # Entries to the right of the diagonal in this row.
        for col in range(row + 1, n):
            cross_acc = 0
            for above in range(row):
                cross_acc += U[above, row].conjugate() * U[above, col]
            U[row, col] = (A[row, col] - cross_acc) / U[row, row]

    return U
## ##app_name = "testEfficiency_mathcheck" ##math_run = run() ##events = "--events gld_request,gst_request,shared_load,shared_store" ##math_event_run = run() # # #return info ##export PYTHONPATH="/home/me/Work/UiO/Phd/Code/Profile" ##python testEfficiency.py testMVDRKernelPerformance # ##nvprof ./testEfficiency testEfficiency testMVDRKernelPerformance M = 8 L_list = np.arange(M - 1) + 2 time = np.zeros((L_list.shape[0], 6)) for l, L in enumerate(L_list): time[l] = collectResults(M, L) np.savetxt('time-M%d.txt' % M, time) M = 16 L_list = np.arange(M - 1) + 2 time = np.zeros((L_list.shape[0], 6)) for l, L in enumerate(L_list): time[l] = collectResults(M, L) np.savetxt('time-M%d.txt' % M, time) M = 32 L_list = np.arange(M - 1) + 2 time = np.zeros((L_list.shape[0], 6)) for l, L in enumerate(L_list):
def setUp(self):
    """Create the fixtures shared by the tests of this class.

    Sets up small hand-written real and complex matrices/vectors together
    with their expected derived quantities, loads four arrays from
    ``./data/*.npy`` (paths relative to the working directory), and builds
    a random Hermitian 24 x 24 system.
    """
    self.n = 3
    # --- real-valued 3x3 fixtures (stored with complex dtype) ---
    # A is tridiagonal; A2 = 2*A, AA = A*A, invA = inverse of A.
    self.A = np.array([[2.0, -1.0, 0.0], [-1.0, 2.0, -1.0], [0.0, -1.0, 2.0]],dtype=complex)
    self.A2 = np.array([[4.0, -2.0, 0.0], [-2.0, 4.0, -2.0], [0.0, -2.0, 4.0]],dtype=complex)
    # Upper triangular; R^H R reproduces A to the digits given.
    self.R = np.array([[1.414213562373095, -0.707106781186547, 0.0], [0.0, 1.224744871391589, -0.816496580927726], [0.0, 0.0, 1.154700538379252]],dtype=complex)
    self.AA = np.array([[5, -4, 1], [-4, 6, -4], [1, -4, 5]],dtype=complex)
    # BT is the transpose of B.
    self.B = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]],dtype=complex)
    self.BT = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]],dtype=complex)
    # x1 and x2 solve A x = b1 and A x = b2 respectively.
    self.b1 = np.array([1.0, 1.0, 1.0],dtype=complex)
    self.x1 = np.array([3.0/2, 2.0, 3.0/2],dtype=complex)
    self.b2 = np.array([1, 2, 3],dtype=complex)
    self.x2 = np.array([5.0/2, 4, 7.0/2],dtype=complex)
    # C is upper triangular; x1c solves C x = b1c, x1cT solves C^T x = b1c.
    self.C = np.array([[1, 2, 3], [0, 1, 1], [0, 0, 1]],dtype=complex)
    self.b1c = np.array([1, 1, 1],dtype=complex)
    self.x1c = np.array([-2, 0, 1],dtype=complex)
    self.x1cT = np.array([1, -1, -1],dtype=complex)
    self.x0_zero = np.zeros((3,), dtype=complex)
    # Ab1 = A*b1 and Ab2 = A*b2.
    self.Ab2 = np.array([0, 0, 4],dtype=complex)
    self.Ab1 = np.array([1, 0, 1],dtype=complex)
    self.Ab1b1T = np.array([[3, 1, 3], [1, 6, 5], [3, 5, 11]],dtype=complex)
    self.invA = np.array([[0.750, 0.50, 0.250], [0.50, 1.0, 0.50], [0.250, 0.50, 0.750]],dtype=complex)
    self.invAb1b1T = np.array([[0.465909090909091, 0.045454545454545, -0.147727272727273], [0.045454545454545, 0.272727272727273, -0.136363636363636], [-0.147727272727273, -0.136363636363636, 0.193181818181818]],dtype=complex)
    # --- complex Hermitian 3x3 fixtures ---
    self.complexA = np.array([[2.0, 3.0 + 1.0j, 2.0 - 2.0j], [3.0 - 1.0j, 9.0, -2.0j], [2.0 + 2.0j, 2.0j, 14.0]], dtype=complex)
    # Upper triangular factor associated with complexA.
    self.complexR = np.array([[1.414213562373095, 2.121320343559642 + 0.707106781186547j, 1.414213562373095 - 1.414213562373095j], [0.0, 2.0, -1.0 + 1.0j], [0.0, 0.0, 2.828427124746190]], dtype=complex)
    self.complexb = np.array([1.0, 1.0 + 1.0j, 1.0 - 2.0j], dtype=complex)
    self.complexy = np.array([0.707106781186547 - 0.0j, -0.250 + 0.750j, -0.353553390593273 - 0.883883476483184j], dtype=complex)
    self.complexx = np.array([1.593749999999999 - 0.06250j, -0.343750 + 0.281250j, -0.1250 - 0.31250j], dtype=complex)
    self.complexAb = np.array([2.0 - 2.0j, 8.0 + 6.0j, 14.0 - 24.0j], dtype=complex)
    # --- normalized 4-point and 3-point DFT (Butler) matrices ---
    self.but4 = np.array([[0.5]*4, [0.5, -0.5j, -0.5, 0.5j], [0.5, -0.5]*2, [0.5, 0.5j, -0.5, -0.5j]], dtype=complex)
    self.but3 = np.array([[0.577350269189626, 0.577350269189626, 0.577350269189626], [0.577350269189626, -0.288675134594813 - 0.50j, -0.288675134594813 + 0.50j], [0.577350269189626, -0.288675134594813 + 0.50j, -0.288675134594813 - 0.50j]], dtype=complex)
    self.bsComplexb = np.array([1.732050807568878 - 0.577350269189626j, 1.50 + 0.288675134594813j], dtype=complex)
    self.diag = 0.2
    self.x = np.array([1.0, 1.0 + 1.0j, 1.0 - 2.0j, 2.0 + 1.0j], dtype=complex)
    self.complexAbbH = np.array([[10.0, 11.0 + 1.0j, 10.0 - 2.0j], [11.0 - 1.0j, 17.0, 8.0 - 2.0j], [10.0 + 2.0j, 8.0 + 2.0j, 22.0]], dtype=complex)
    self.complexInvAbbH = np.array([[1.067010309278351, -0.407216494845361 - 0.015463917525773j, -0.190721649484536 + 0.020618556701031j], [-0.407216494845361 + 0.015463917525773j, 0.247422680412371, 0.077319587628866 - 0.015463917525773j], [-0.190721649484536 - 0.020618556701031j, 0.077319587628866 + 0.015463917525773j, 0.087628865979381]], dtype=complex)
    self.complexInvA = np.array([[1.906250, -0.593750 - 0.156250j, -0.250 + 0.18750j], [-0.593750 + 0.156250j, 0.31250, 0.06250 - 0.06250j], [-0.250 - 0.18750j, 0.06250 + 0.06250j, 0.1250]], dtype=complex)
    # --- Hermitian 4x4 matrix with a unit upper triangular U and diagonal D
    # given to four decimals (presumably its U^H D U factors -- verify
    # against the test that uses them) ---
    self.complexA4x4 = np.array([[22.0, 8.0, 11.0 - 11.0j, 22.0 - 7.0j], [8.0, 22.0, 17.0 - 2.0j, 11.0 - 7.0j], [11.0 + 11.0j, 17.0 + 2.0j, 45.0, 23.0 - 5.0j], [22.0 + 7.0j, 11.0 + 7.0j, 23.0 + 5.0j, 37.0]], dtype=complex)
    self.U4x4 = np.array([[1.0000, 0.3636, 0.50 - 0.50j, 1.0 - 0.3182j], [0.0, 1.0, 0.6810 + 0.1048j, 0.1571 - 0.2333j], [0.0, 0.0, 1.0, 0.2776 - 0.3670j], [0.0, 0.0, 0.0, 1.0]], dtype=complex)
    self.D4x4 = np.array([22.0, 19.0909, 24.9381, 5.9806])
    # --- recorded data loaded from disk ---
    self.sonardata_R = np.array(np.load('./data/data_R.npy')) # created without diagonal loading
    self.sonardata_a = np.array(np.load('./data/data_a.npy'))
    self.sonardata_Ria = np.array(np.load('./data/data_Ria.npy'))
    self.sonardata_ar = np.array(np.load('./data/data_ar.npy'))
    self.sonardata_n = 32
    # random data for testing
    self.L = L = 24
    self.d = d = 100
    # Upper triangular complex Gaussian matrix with d added on the diagonal.
    U = np.triu(np.random.randn(L,L) + np.random.randn(L,L)*1j) + np.eye(L)*d
    # randA = U^H U is Hermitian by construction.
    self.randA = np.dot(U.conjugate().T, U)
    self.randb = np.random.randn(L) + np.random.randn(L)*1j
## ##app_name = "testEfficiency_mathcheck" ##math_run = run() ##events = "--events gld_request,gst_request,shared_load,shared_store" ##math_event_run = run() # # #return info ##export PYTHONPATH="/home/me/Work/UiO/Phd/Code/Profile" ##python testEfficiency.py testMVDRKernelPerformance # ##nvprof ./testEfficiency testEfficiency testMVDRKernelPerformance M = 8 L_list = np.arange(M-1)+2 time = np.zeros((L_list.shape[0],6)) for l,L in enumerate(L_list): time[l] = collectResults(M,L) np.savetxt('time-M%d.txt'%M,time) M = 16 L_list = np.arange(M-1)+2 time = np.zeros((L_list.shape[0],6)) for l,L in enumerate(L_list): time[l] = collectResults(M,L) np.savetxt('time-M%d.txt'%M,time) M = 32 L_list = np.arange(M-1)+2 time = np.zeros((L_list.shape[0],6)) for l,L in enumerate(L_list):
def getTimings(app_name='testEfficiency_default',
               functions=('gauss_solve', 'buildR_kernel', 'amplitude_capon'),
               M=8, L=4):
    """Return one timing value per kernel, parsed from an nvprof report.

    Runs ``./<app_name> testEfficiency testMVDRKernelPerformance M L`` under
    ``nvprof -u ns`` and scans the report.  For the first entry of
    ``functions`` that matches a report line, the leading whitespace and
    everything from the (optionally letter/space-prefixed) kernel name
    onward are stripped; the remaining whitespace-separated fields are
    converted to floats and stored as that kernel's row.

    Parameters
    ----------
    app_name : str
        Name of the executable; it is run as ``./<app_name>``.
    functions : sequence of str
        Kernel names.  Each one is interpolated unescaped into a regular
        expression and searched for in every report line.
    M, L : int
        Passed to the executable as its last two command-line arguments.

    Returns
    -------
    numpy.ndarray
        Field index 3 of each kernel's row divided by 1e3, one value per
        entry of ``functions`` (presumably the per-call average converted
        from ns to us -- confirm against the nvprof column layout in use).
        When the module-level flag ``USE_CODE_COUNTERS`` is true the
        executable is run without the profiler, its output is discarded and
        an all-zero array is returned.

    Raises
    ------
    Exception
        If the profiler run wrote anything to stderr; the message is the
        stderr text.
    IndexError
        If no report line matches any entry of ``functions``.
    """
    if USE_CODE_COUNTERS:
        prof = Popen("./%s testEfficiency testMVDRKernelPerformance %d %d"
                     % (app_name, M, L),
                     shell=True, stdout=PIPE, stderr=PIPE)
        # Wait for the run to finish; its output is not used here.
        prof.communicate()
        return np.zeros((len(functions),))

    events = ''  # no extra nvprof options for a plain timing run
    command = ("nvprof -u ns %s ./%s "
               "testEfficiency testMVDRKernelPerformance %d %d"
               % (events, app_name, M, L))
    # universal_newlines=True makes communicate() return text on both
    # Python 2 and Python 3, so the comparison with '' below is meaningful.
    prof = Popen(command, shell=True, stdout=PIPE, stderr=PIPE,
                 universal_newlines=True)
    profiler_output, error = prof.communicate()
    if error != '':
        print(profiler_output)
        raise Exception(error)

    runtimes = None  # allocated on the first match, once the width is known
    for line in re.split(r"\n+", profiler_output):
        for i, function in enumerate(functions):
            if not re.search(".+%s.+" % function, line):
                continue
            fields = re.sub(r"^\s*", '', line)
            fields = re.sub(r"[\sa-zA-Z]*%s.+" % function, '', fields)
            columns = re.split(r"\s+", fields)
            if runtimes is None:
                runtimes = np.zeros((len(functions), len(columns)))
            # A list (not a lazy map object) so the row assignment also
            # works on Python 3.
            runtimes[i] = [float(column) for column in columns]
            break  # a report line belongs to at most one kernel

    if runtimes is None:
        # Same exception type the original 0-d array indexing produced.
        raise IndexError("no profiler output line matched any of %r"
                         % (list(functions),))
    return runtimes[:, 3] / 1e3
def getInstructions(
        app_name='testEfficiency_default',
        functions=('gauss_solve', 'buildR_kernel', 'amplitude_capon'),
        M=16,
        L=8):
    """Return the issued-instruction count nvprof reports for each kernel.

    Runs ``./<app_name> testEfficiency testMVDRKernelPerformance M L`` under
    ``nvprof`` with the four ``inst_issued*`` event counters enabled and
    parses the textual report.

    For every report line that matches one of ``functions``, the following
    four lines are read (one per event, in the order the events were
    requested).  On each of them the leading whitespace and everything from
    the event name onward are stripped, and the second whitespace-separated
    field is taken as the counter value.

    Parameters
    ----------
    app_name : str
        Name of the executable; it is run as ``./<app_name>``.
    functions : sequence of str
        Kernel names.  Each one is interpolated unescaped into a regular
        expression and searched for in every report line.
    M, L : int
        Passed to the executable as its last two command-line arguments.

    Returns
    -------
    numpy.ndarray
        One value per entry of ``functions``, computed as
        ``inst_issued1_0 + 2*inst_issued2_0 + inst_issued1_1 +
        2*inst_issued2_1`` (formula quoted from the CUPTI user's manual by
        the original author).  Kernels that never appear in the report keep
        the value 0.  When the module-level flag ``ONLY_TIME`` is true
        nothing is executed and an all-zero array is returned.

    Raises
    ------
    Exception
        If the profiler run wrote anything to stderr; the message is the
        stderr text.
    IndexError
        If the report ends before all event lines of a matched kernel.
    """
    if ONLY_TIME:
        # One slot per requested kernel (the default list has three entries).
        return np.zeros((len(functions),))

    events = "inst_issued1_0,inst_issued2_0,inst_issued1_1,inst_issued2_1"
    events_list = events.split(',')

    command = ("nvprof -u ns --devices 0 --events %s ./%s "
               "testEfficiency testMVDRKernelPerformance %d %d"
               % (events, app_name, M, L))
    # universal_newlines=True makes communicate() return text on both
    # Python 2 and Python 3, so the comparison with '' below is meaningful.
    prof = Popen(command, shell=True, stdout=PIPE, stderr=PIPE,
                 universal_newlines=True)
    profiler_output, error = prof.communicate()
    if error != '':
        print(profiler_output)
        raise Exception(error)

    lines = re.split(r"\n+", profiler_output)
    counters = np.zeros((len(functions), len(events_list)))
    for i, line in enumerate(lines):
        for j, function in enumerate(functions):
            if not re.search(".+%s.+" % function, line):
                continue
            # The event rows of a kernel directly follow its header line.
            for k, event in enumerate(events_list):
                fields = re.sub(r"^\s*", '', lines[i + k + 1])
                fields = re.sub(r"\s*%s.*" % event, '', fields)
                columns = re.split(r"\s+", fields)
                counters[j, k] = float(columns[1])

    # Dual-issue events count two instructions each.
    return (counters[:, 0] + 2 * counters[:, 1]
            + counters[:, 2] + 2 * counters[:, 3])