def get_train_features(src_domain, tgt_domain):
    srcfile_loc = osp.join(DATA_DIR, src_domain + '.txt')
    tgtfile_loc = osp.join(DATA_DIR, tgt_domain + '.txt')
    # First build the vocabulary, which is used to generate the feature vectors
    vocab_all = create_vocab_all(srcfile_loc, tgtfile_loc)
    src_authors, src_author_paper = get_train_author_paper(srcfile_loc)
    tgt_authors, tgt_author_paper = get_train_author_paper(tgtfile_loc)
    src_paper_token = get_train_paper_token(srcfile_loc, vocab_all)
    tgt_paper_token = get_train_paper_token(tgtfile_loc, vocab_all)
    # passing b=X.T together with trans_b=True still computes a @ X: the two
    # transposes cancel, while BLAS reads the Fortran-ordered view without a copy
    src_features = blas.sgemm(alpha=1.0, a=src_author_paper,
                              b=src_paper_token.T, trans_b=True)
    tgt_features = blas.sgemm(alpha=1.0, a=tgt_author_paper,
                              b=tgt_paper_token.T, trans_b=True)
    feature_dict = {}
    feature_dict['src_authors'] = src_authors
    feature_dict['tgt_authors'] = tgt_authors
    feature_dict['src_paper_author'] = src_author_paper.T
    feature_dict['tgt_paper_author'] = tgt_author_paper.T
    feature_dict['src_features'] = src_features
    feature_dict['tgt_features'] = tgt_features
    feature_dict['vocab_all'] = vocab_all
    return feature_dict
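# A minimal, self-contained sketch of the sgemm idiom used above (and verified
# in test_scipy_sgemm further below): passing b=B.T with trans_b=True still
# computes A @ B, since the two transposes cancel. Array names are illustrative.
import numpy as np
from scipy.linalg import blas

A = np.random.rand(4, 6).astype(np.float32)
B = np.random.rand(6, 3).astype(np.float32)
# B.T is Fortran-ordered, so BLAS can consume it without copying;
# trans_b=True undoes the transpose mathematically.
res = blas.sgemm(alpha=1.0, a=A, b=B.T, trans_b=True)
assert np.allclose(res, A @ B, atol=1e-5)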
def test_gemm_323(self):
    A = numpy.arange(6).reshape((2, 3)) + 1
    B = numpy.arange(6).reshape((3, 2)) + 10
    for dtype in [numpy.float32, numpy.float64, numpy.int64]:
        a = A.astype(dtype)
        b = B.astype(dtype)
        for t1 in [False, True]:
            for t2 in [False, True]:
                with self.subTest(dtype=dtype, transA=t1, transB=t2,
                                  shapeA=a.shape, shapeB=b.shape):
                    ta = a.T if t1 else a
                    tb = b.T if t2 else b
                    try:
                        exp = ta @ tb
                    except ValueError:
                        continue
                    if t1:
                        M = a.shape[1]
                        lda = a.shape[0]
                        K = a.shape[0]
                    else:
                        M = a.shape[0]
                        lda = a.shape[0]
                        K = a.shape[1]
                    if t2:
                        N = b.shape[0]
                        ldb = b.shape[1]
                    else:
                        N = b.shape[1]
                        ldb = b.shape[1]
                    ldc = N
                    c = numpy.empty(M * N, dtype=a.dtype)
                    pygemm(t2, t1, N, M, K, 1., b.ravel(), ldb,
                           a.ravel(), lda, 0., c, ldc)
                    cc = c.reshape((M, N))
                    # self.assertEqualArray(exp, cc)
                    if dtype == numpy.float32:
                        res = sgemm(1, a, b, 0, cc, t1, t2)
                        self.assertEqualArray(exp, res)
                        cc[:, :] = 0
                        sgemm(1, a, b, 0, cc, t1, t2, 1)
                        try:
                            self.assertEqualArray(exp, cc)
                        except AssertionError:
                            # Overwriting the result does not seem
                            # to work.
                            pass
                    got = gemm_dot(a, b, t1, t2)
                    self.assertEqualArray(exp, got)
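# The operand swap in the pygemm calls above (b before a, t2 before t1) relies
# on the identity (A @ B).T == B.T @ A.T: handing a column-major GEMM the
# operands in reverse order yields the row-major product. A NumPy illustration:
import numpy as np

A = np.random.rand(2, 3).astype(np.float32)
B = np.random.rand(3, 4).astype(np.float32)
assert np.allclose((A @ B).T, B.T @ A.T)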
def _compute_reprs(net_in, net, layers_style, layers_content, gram_scale=1):
    """
    Computes representation matrices for an image.

    :param net_in: content image or style image
    :param net: caffe network
    :param layers_style: layers selected for Style Target.
        If net_in is content image, this should be []
    :param layers_content: layers selected for Content Target.
        If net_in is style image, this should be []
    """
    # input data and forward pass
    (repr_s, repr_c) = ({}, {})
    net.blobs["data"].data[0] = net_in
    net.forward()
    # decide if net_in is content image or style image
    if layers_style == []:
        repr_s = {}
    if layers_content == []:
        repr_c = {}
    """
    TODO #6
    Calculate representations for content and style
    """
    for layer in (set(layers_style) | set(layers_content)):
        f_sc = net.blobs[layer].data[0].copy()
        (a, b, c) = f_sc.shape
        f_sc = np.reshape(f_sc, (a, b * c))
        repr_c[layer] = f_sc
        if layer in layers_style:
            repr_s[layer] = sgemm(gram_scale, f_sc, f_sc.T)
    return repr_s, repr_c
def matrix_mult(i, len_i, j, len_j, mat_size, name_A, name_B, name_C):
    # The different cores often have separate timers, so we mark the time
    # before any calculation so that results can be offset to the same
    # starting time
    stabiliser_time = time.time()
    # Identifies the shared memory blocks of A, B, C
    existing_shm_A = shared_memory.SharedMemory(name=name_A)
    existing_shm_B = shared_memory.SharedMemory(name=name_B)
    existing_shm_C = shared_memory.SharedMemory(name=name_C)
    # Calculates the (i, j) coordinates of the submatrices that must be worked on
    i1 = i * len_i
    i2 = (i + 1) * len_i
    j1 = j * len_j
    j2 = (j + 1) * len_j
    # Reads the relevant blocks of A, B, C from shared memory
    sub_mat_A = np.ndarray((mat_size, mat_size), dtype=np.float32,
                           buffer=existing_shm_A.buf)[i1:i2, :]
    sub_mat_B = np.ndarray((mat_size, mat_size), dtype=np.float32,
                           buffer=existing_shm_B.buf)[:, j1:j2]
    sub_mat_C = np.ndarray((mat_size, mat_size), dtype=np.float32,
                           buffer=existing_shm_C.buf)
    # Marks the start of the calculation time
    calc_start = time.time()
    # ----------------------------------------------
    # Calculates the submatrix C' using sgemm and saves it to shared memory
    sub_mat_C[i1:i2, j1:j2] = FB.sgemm(alpha=1.0, a=sub_mat_A, b=sub_mat_B)
    # ----------------------------------------------
    # Marks the end of the calculation time
    calc_finish = time.time()
    # Closes the link to the shared memory blocks
    existing_shm_A.close()
    existing_shm_B.close()
    existing_shm_C.close()
    # Returns all the timing results
    return stabiliser_time, calc_start, calc_finish
def test_gemm_1(self):
    A = numpy.arange(1).reshape((1, 1)) + 1
    B = numpy.arange(1).reshape((1, 1)) + 10
    for dtype in [numpy.float32, numpy.float64, numpy.int64]:
        a = A.astype(dtype)
        b = B.astype(dtype)
        for t1 in [False, True]:
            for t2 in [False, True]:
                with self.subTest(dtype=dtype, transA=t1, transB=t2,
                                  shapeA=a.shape, shapeB=b.shape):
                    ta = a.T if t1 else a
                    tb = b.T if t2 else b
                    exp = ta @ tb
                    got = gemm_dot(a, b, t1, t2)
                    self.assertEqualArray(exp, got)
                    M, N, K = 1, 1, 1
                    lda, ldb, ldc = 1, 1, 1
                    c = numpy.empty(M * N, dtype=a.dtype)
                    pygemm(t2, t1, M, N, K, 1., b.ravel(), ldb,
                           a.ravel(), lda, 0., c, ldc)
                    cc = c.reshape((M, N))
                    self.assertEqualArray(exp, cc)
                    if dtype == numpy.float32:
                        res = sgemm(1, a, b, 0, cc, t1, t2)
                        self.assertEqualArray(exp, res)
def _gram(self, layer):
    """
    Compute the Gram matrix: just the dot product of the layer and its transpose
    """
    gram = blas.sgemm(1.0, layer, layer.T)
    return gram
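# A small sanity check of the Gram computation above; F stands in for a
# (channels x positions) feature matrix and is synthetic, not from a real net.
import numpy as np
from scipy.linalg import blas

F = np.random.rand(3, 5).astype(np.float32)
gram = blas.sgemm(1.0, F, F.T)
assert gram.shape == (3, 3)
assert np.allclose(gram, F @ F.T, atol=1e-5)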
def style_lag(self, noisies, grams, i, compute_grad=False):
    """
    Compute style losses and gradients for all gram matrices.
    This is compressed into one function to save intermediate computations.
    It is assumed that gram matrices and self.style_targets correspond to
    identical layers.
    """
    # Get everything.
    style_noisy = noisies[i]
    style_gram = grams[i]
    style_target = self.style_targets[i]
    weight = STYLE_WEIGHTS[i]
    diff = (style_gram - style_target)
    size_c = (1. / ((style_noisy.shape[0] ** 2) * (style_noisy.shape[1] ** 2)))
    loss = (size_c / 4) * (diff ** 2).sum() * weight
    if compute_grad:
        gradient = (size_c * blas.sgemm(1.0, diff, style_noisy)
                    * (style_noisy > 0) * weight)
        return loss, gradient
    return loss, None
def test_scipy_sgemm():
    pair_domain = four_pair_of_domains[0]
    src_domain, tgt_domain = pair_domain[0], pair_domain[1]
    srcfile_loc = osp.join(DATA_DIR, src_domain + '.txt')
    tgtfile_loc = osp.join(DATA_DIR, tgt_domain + '.txt')
    src_names, src_name_line_count = get_names_counts(srcfile_loc)
    tgt_names, tgt_name_line_count = get_names_counts(tgtfile_loc)
    common_tokens = get_common_tokens(srcfile_loc, tgtfile_loc)
    print('src', src_name_line_count.shape)
    print('tgt', tgt_name_line_count.shape)
    print('common', len(common_tokens))
    src_line_token_count = get_tokens_counts(srcfile_loc, common_tokens)
    tgt_line_token_count = get_tokens_counts(tgtfile_loc, common_tokens)
    print('src token', src_line_token_count.shape)
    print('tgt token', tgt_line_token_count.shape)
    res = blas.sgemm(alpha=1.0, a=src_name_line_count,
                     b=src_line_token_count.T, trans_b=True)
    print('we need <dot result> == <sgemm result>')
    X, Y = np.nonzero(res)
    for _ in range(10):
        rand_ind = random.choice(range(len(X)))
        xx = X[rand_ind]
        yy = Y[rand_ind]
        print('dot res:', np.dot(src_name_line_count[xx, :],
                                 src_line_token_count[:, yy]))
        print('sgemm res:', res[xx, yy])
        print()
def matrix_mult(i1, i2, j1, j2, mat_size):
    A_np = np.frombuffer(var_dict['A'], dtype=np.float32).reshape(
        (mat_size, mat_size))
    B_np = np.frombuffer(var_dict['B'], dtype=np.float32).reshape(
        (mat_size, mat_size))
    # mat_C = np.zeros((i2 - i1, j2 - j1))
    mat_C = FB.sgemm(alpha=1.0, a=A_np[i1:i2, :], b=B_np[:, j1:j2])
    return mat_C
def gram_matrix(X, y=None, kernel="linear", bandwidth=1, centered=False): k = None if kernel == "linear": if y is None: k = sgemm(alpha=1.0, a=X, b=X, trans_b=True) else: k = sgemm(alpha=1.0, a=X, b=y, trans_b=True) elif kernel == "rbf": # print("rbf kernel: ") euc_dist = np.einsum('ij,ij->i', X, X) if y is not None: euc_dist_y = np.einsum('ij,ij->i', y, y) else: euc_dist_y = euc_dist y = X k = ne.evaluate( 'exp(-b * (A + B - 2 * C))', { 'A': euc_dist[:, None], 'B': euc_dist_y[None, :], 'C': sgemm(alpha=1.0, a=X, b=y, trans_b=True), 'b': bandwidth, }) # elif kernel == "rbf": # print("rbf kernel: ") # euc_dist = np.einsum('ij,ij->i', X, X) # k = ne.evaluate('exp(-b * (A + B - 2 * C))', { # 'A': euc_dist[:, None], # 'B': euc_dist[None, :], # 'C': sgemm(alpha=1.0, a=X, b=X, trans_b=True), # 'b': bandwidth, # }) if centered and k.shape[0] == k.shape[1]: N = X.shape[0] identity_n = np.ones((N, N)) / N first_term = k second_term = np.dot(identity_n, k) third_term = np.dot(k, identity_n) fourth_term = np.dot(identity_n, third_term) k = first_term - second_term - third_term + fourth_term return k
def main():
    lock = int(sys.argv[1])
    size_list = [2**i for i in range(8, 16)]
    no_runs = 10
    columns = ["My function (Python)", "My function (32 Cores Python)",
               "matmul (NumPy Python)", "dgemm (Python)", "sgemm (Python)"]
    time_df = pd.DataFrame(columns=columns)
    for mat_size in size_list:
        print(f"Mat size: {mat_size}")
        for i in range(no_runs):
            print(f"i: {i}")
            m1 = np.random.rand(mat_size, mat_size).astype(np.float32)
            m2 = np.random.rand(mat_size, mat_size).astype(np.float32)
            new_times = []
            time.sleep(10)
            if mat_size <= 2048:
                my_func_start = time.perf_counter()
                m_myfunc = matrix_mult(m1, m2)
                my_func_finish = time.perf_counter()
                new_times.append(round(my_func_finish - my_func_start, 8))
                my_func_32cores_start = time.perf_counter()
                time_taken, m_myfunc32 = gen_time_results(mat_size, 32, m1, m2)
                my_func_32cores_finish = time.perf_counter()
                new_times.append(round(my_func_32cores_finish - my_func_32cores_start, 8))
            else:
                new_times.append(None)
                new_times.append(None)
            numpy_start = time.perf_counter()
            mn = np.matmul(m1, m2)
            numpy_finish = time.perf_counter()
            new_times.append(round(numpy_finish - numpy_start, 8))
            dgemm_start = time.perf_counter()
            md = FB.dgemm(alpha=1.0, a=m1, b=m2)
            dgemm_finish = time.perf_counter()
            new_times.append(round(dgemm_finish - dgemm_start, 8))
            sgemm_start = time.perf_counter()
            ms = FB.sgemm(alpha=1.0, a=m1, b=m2)
            sgemm_finish = time.perf_counter()
            new_times.append(round(sgemm_finish - sgemm_start, 8))
            print(new_times)
            # store the measured times (the original appended an empty frame)
            time_df = time_df.append(
                pd.DataFrame([new_times], columns=columns, index=[mat_size]))
    if lock:
        time_df.to_pickle("time_df_libraries_lock.pkl")
    else:
        time_df.to_pickle("time_df_libraries_no_lock.pkl")
def style_loss(F, A, layer, style_layers):
    idx = style_layers.index(layer) + 1
    Fl = np.squeeze(F[idx])
    Al = np.squeeze(A[idx])
    channel, row, col = Fl.shape
    Fl = Fl.reshape((channel, row * col))
    Al = Al.reshape((channel, row * col))
    gram_F = sgemm(1, Fl, Fl.T)
    gram_A = sgemm(1, Al, Al.T)
    denom = (2 * channel * row * col) ** 2
    loss = np.sum((gram_F - gram_A) ** 2) / denom
    grad = 4 * sgemm(1, gram_F - gram_A, Fl) * (Fl > 0) / denom
    return loss, grad
def rbf_kernel_fast(X, precision):
    # exp(-g*||x||^2 - g*||y||^2 + 2g*x.y) == exp(-g * ||x - y||^2)
    gamma = precision / 2
    X_norm = -gamma * np.einsum('ij,ij->i', X, X)
    return ne.evaluate(
        'exp(A + B + C)', {
            'A': X_norm[:, None],
            'B': X_norm[None, :],
            'C': sgemm(alpha=2.0 * gamma, a=X, b=X, trans_b=True),
        })
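# A quick check of rbf_kernel_fast against the direct definition
# exp(-(precision/2) * ||x_i - x_j||^2); the data here is synthetic.
import numpy as np

X = np.random.rand(30, 4).astype(np.float32)
precision = 2.0
K = rbf_kernel_fast(X, precision)
D2 = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
assert np.allclose(K, np.exp(-precision / 2 * D2), atol=1e-4)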
def _compute_style_grad(F, G, G_style, layer):
    """
    Computes style gradient and loss from activation features.
    """
    # compute loss and gradient
    (Fl, Gl) = (F[layer], G[layer])
    c = Fl.shape[0] ** -2 * Fl.shape[1] ** -2
    El = Gl - G_style[layer]
    loss = c / 4 * (El ** 2).sum()
    grad = c * sgemm(1.0, El, Fl) * (Fl > 0)
    return loss, grad
def rbf(X, Y=None, gamma=1.0, gradient=False):
    """
    Compute the row-wise kernel matrix of X and Y.

    Parameters
    ----------
    X : numpy array
        first array of size (n x m).
    Y : numpy array, optional
        second array of size (o x m). The default is None.
    gamma : float, optional
        Length scale. The default is 1.0.

    Returns
    -------
    kernel matrix as numpy array of size (n x o).
    """
    XX = np.einsum('ij,ij -> i', X, X)
    if Y is None:
        Y = X
        YY = XX
        Y_flag = True
    else:
        YY = np.einsum('ij,ij -> i', Y, Y)
        Y_flag = False
    dist = ne.evaluate(
        '(A + B - C) / g**2', {
            'A': XX[:, None],
            'B': YY[None, :],
            'C': sgemm(alpha=2, a=X, b=Y, trans_b=True),
            'g': gamma
        })
    if Y_flag:
        np.fill_diagonal(dist, 0)
    K = np.exp(-0.5 * dist)
    if gradient:
        grad = K * dist
        np.fill_diagonal(grad, 0)
        return K, grad
    return K
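# Illustrative use of rbf above: with Y omitted the kernel matrix is symmetric
# with a unit diagonal, since each entry is exp(-0.5 * ||x_i - x_j||^2 / gamma^2).
import numpy as np

X = np.random.rand(20, 3).astype(np.float32)
K = rbf(X, gamma=2.0)
assert np.allclose(K, K.T, atol=1e-5)
assert np.allclose(np.diag(K), 1.0)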
def gen_time_results(mat_size, no_runs):
    # note: np.random.rand returns float64, so sgemm must down-cast its
    # inputs to single precision internally
    mat_A = np.random.rand(mat_size, mat_size)
    mat_B = np.random.rand(mat_size, mat_size)
    time_list_d = []
    time_list_s = []
    for _ in range(no_runs):
        start = time.perf_counter()
        result = FB.dgemm(alpha=1, a=mat_A, b=mat_B)
        finish = time.perf_counter()
        time_taken_d = round(finish - start, 10)
        time_list_d.append(time_taken_d)
        start = time.perf_counter()
        result = FB.sgemm(alpha=1, a=mat_A, b=mat_B)
        finish = time.perf_counter()
        time_taken_s = round(finish - start, 10)
        time_list_s.append(time_taken_s)
    return time_list_d, time_list_s
def _compute_reprs(net, layers_style, layers_content, net_in, scale_gram=1):
    """
    Computes representation matrices for an image.
    """
    # copy activations to output from forward pass
    (repr_s, repr_c) = ({}, {})
    net.blobs["data"].data[0] = net_in
    net.forward(end=list(net.params.keys())[-1])
    # loop through combined set of layers
    for layer in set(layers_style) | set(layers_content):
        F = net.blobs[layer].data[0].copy()
        F.shape = (F.shape[0], -1)
        repr_c[layer] = F
        if layer in layers_style:
            repr_s[layer] = sgemm(scale_gram, F, F.T)
    return repr_s, repr_c
def _compute_reprs(net_in, net, layers_style, layers_content, gram_scale=1):
    """
    Computes representation matrices for an image.
    """
    # input data and forward pass
    (repr_s, repr_c) = ({}, {})
    net.blobs["data"].data[0] = net_in
    net.forward()
    # loop through combined set of layers
    for layer in set(layers_style) | set(layers_content):
        F = net.blobs[layer].data[0].copy()
        F.shape = (F.shape[0], -1)
        repr_c[layer] = F
        if layer in layers_style:
            repr_s[layer] = sgemm(gram_scale, F, F.T)
    return repr_s, repr_c
def matrix_mult(mat_A, mat_B):
    calc_start = time.perf_counter()
    mat_C = FB.sgemm(alpha=1.0, a=mat_A, b=mat_B)
    calc_finish = time.perf_counter()
    return mat_C, calc_start, calc_finish
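# Example invocation of the timing wrapper above; the size is illustrative and
# the surrounding script is assumed to have imported numpy, time and FB.
import numpy as np

m1 = np.random.rand(256, 256).astype(np.float32)
m2 = np.random.rand(256, 256).astype(np.float32)
mat_C, calc_start, calc_finish = matrix_mult(m1, m2)
print("sgemm took", calc_finish - calc_start, "seconds")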
for mat_size in mat_sizes:
    print(f"Mat size: {mat_size}")
    total_time_DGEMM = 0
    for _ in range(no_runs):
        m1 = np.random.rand(mat_size, mat_size)
        m2 = np.random.rand(mat_size, mat_size)
        start = time.perf_counter()
        md = FB.dgemm(alpha=1, a=m1, b=m2)
        finish = time.perf_counter()
        time_taken = round(finish - start, 8)
        total_time_DGEMM += time_taken
        # assert md.all() == ans.all()
    print(total_time_DGEMM / no_runs)
    print("\n")

print("---- LAPACK SGEMM ----")
for mat_size in mat_sizes:
    print(f"Mat size: {mat_size}")
    total_time_SGEMM = 0
    for _ in range(no_runs):
        m1 = np.random.rand(mat_size, mat_size)
        m2 = np.random.rand(mat_size, mat_size)
        start = time.perf_counter()
        ms = FB.sgemm(alpha=1, a=m1, b=m2)
        finish = time.perf_counter()
        time_taken = round(finish - start, 8)
        total_time_SGEMM += time_taken
        # assert ms.all() == ans.all()
    print(total_time_SGEMM / no_runs)
    print("\n")
    send_list = []
    for i in range(i_len):
        for j in range(j_len):
            send_list.append([send_list_A[i], send_list_B[j]])
    # mat_A = None
    # mat_B = None
else:
    mat_A = None
    mat_B = None
    send_list = None

mats = comm.scatter(send_list, root=0)

calc_start = MPI.Wtime()
mat_C = FB.sgemm(alpha=1.0, a=mats[0], b=mats[1])
calc_finish = MPI.Wtime()

res_list = comm.gather(mat_C, root=0)

if rank == 0:
    res = np.vstack(np.split(np.concatenate(res_list, axis=1), i_len, axis=1))
    total_finish = MPI.Wtime()
    scatter_time = calc_start - total_start
    calc_time = calc_finish - calc_start
    gather_time = total_finish - calc_finish
    scatter_sum = np.zeros(0)
# Opening and reading matrix A
fh_A = MPI.File.Open(comm, f"mat_A/mat_A_{mat_size}_{iteration}.txt", amode_A)
buf_mat_A = np.empty((i_size, mat_size), dtype=np.float32)
offset_A = i_coord * buf_mat_A.nbytes
fh_A.Read_at_all(offset_A, buf_mat_A)
fh_A.Close()

# Opening and reading matrix B
fh_B = MPI.File.Open(comm, f"mat_B/mat_B_{mat_size}_{iteration}.txt", amode_B)
buf_mat_B = np.empty((j_size, mat_size), dtype=np.float32)
offset_B = j_coord * buf_mat_B.nbytes
fh_B.Read_at_all(offset_B, buf_mat_B)
mat_B = np.transpose(buf_mat_B)
fh_B.Close()

calc_start = MPI.Wtime()
buf_mat_C = FB.sgemm(alpha=1.0, a=buf_mat_A, b=mat_B)
calc_time = MPI.Wtime() - calc_start

# Writing the C block through a strided MPI vector view
fh_C = MPI.File.Open(comm, f"mat_C/mat_C_{mat_size}_{iteration}.txt", amode_C)
filetype = MPI.FLOAT.Create_vector(j_size, i_size, mat_size)
filetype.Commit()
offset_C = (mat_size * i_coord * i_size + j_coord * j_size) * MPI.FLOAT.Get_size()
fh_C.Set_view(offset_C, filetype=filetype)
fh_C.Write_all(buf_mat_C)
filetype.Free()
fh_C.Close()

total_time = MPI.Wtime() - t_start
################################################################################
func = 'posv'
for prefix in ['s', 'd']:
    funcname = prefix + func
    dtype = get_dtype(funcname)
    m = 8192
    n = 100
    a = np.random.uniform(size=m * m).reshape((m, m)).astype(dtype)
    b = np.ones((m, n), dtype=dtype)
    alpha = 1.
    if dtype == np.float32:
        c = bl.sgemm(alpha, a, b)
    elif dtype == np.float64:
        c = bl.dgemm(alpha, a, b)
    get_time(funcname, [a, c], df)

################################################################################
func = 'potrf'
for prefix in ['s', 'd']:
    funcname = prefix + func
    dtype = get_dtype(funcname)
    m = 8192
    a = np.random.uniform(size=m * m).reshape((m, m)).astype(dtype)
# Checks whether the runtime of LAPACK SGEMM changes drastically
# if the NumPy arrays are cast to single precision before the call

import numpy as np
import time
from scipy.linalg import blas as FB

mat_size = 8192

mat_A = np.random.rand(mat_size, mat_size)
mat_B = np.random.rand(mat_size, mat_size)

t0 = time.time()
mat_C = FB.sgemm(alpha=1.0, a=mat_A, b=mat_B)
t1 = time.time()

mat_A_new = mat_A.astype(np.float32)
mat_B_new = mat_B.astype(np.float32)

t2 = time.time()
mat_C_new = FB.sgemm(alpha=1.0, a=mat_A_new, b=mat_B_new)
t3 = time.time()

print(t1 - t0)
print(t3 - t2)
import numpy as np
from scipy.linalg import blas as FB
import sys

# Reads the matrix size and iteration number from the command line
mat_size = int(sys.argv[1])
iteration = int(sys.argv[2])

mat_A = np.loadtxt(f"mat_A/mat_A_{mat_size}_{iteration}.txt")
mat_B = np.loadtxt(f"mat_B/mat_B_{mat_size}_{iteration}.txt")
mat_B = np.transpose(mat_B)

answer = FB.sgemm(alpha=1.0, a=mat_A, b=mat_B)
res = np.loadtxt(f"mat_C/mat_C_{mat_size}_{iteration}.txt")
print(np.allclose(answer, res))
                  dest=i, tag=25)
    # Defines the matrices that the master core will be operating on
    sub_mat_A = mat_A[displ_A[0]:displ_A[0] + len_i]
    sub_mat_B = mat_B[displ_B[0]:displ_B[0] + len_j]
else:
    # Every worker core receives its submatrices of A and B
    comm.Recv([sub_mat_A, MPI.FLOAT], source=0)
    comm.Recv([sub_mat_B, MPI.FLOAT], source=0)

# Starts the timer for the beginning of the "calculation" portion
comm.Barrier()
calc_start = MPI.Wtime()

# Each core calculates its submatrix C' using sgemm. In the handwritten
# version of this, the "matrix_mult" function is called instead
sub_mat_C = FB.sgemm(alpha=1.0, a=sub_mat_A, b=sub_mat_B, trans_b=True)

# Stops the timer for the "calculation" portion, starting the timer for
# the "gather" portion
comm.Barrier()
calc_finish = MPI.Wtime()

# Creates an empty matrix for the submatrices C' to be gathered into
mat_C = None
if rank == 0:
    mat_C = np.empty(mat_size * mat_size, dtype=np.float32)

# Gathers all of the submatrices C'
count_C = [len_i * len_j for _ in range(size)]
displ_C = [len_i * len_j * list_rank for list_rank in range(size)]
sub_mat_C = np.ascontiguousarray(sub_mat_C, dtype=np.float32)
comm.Gatherv(sub_mat_C, [mat_C, count_C, displ_C, MPI.FLOAT], root=0)
print("Enter matrix size, m x n x l") m = int(input("\n")) n = int(input("\n")) l = int(input("\n")) a = np.random.random((m, l)).astype('float32') b = np.identity((l)).astype('float32') c = np.zeros((m, n)).astype('float32') itermax = 10 ts = time.time() for iteration in range(itermax): c = blas.sgemm(1.0, a, b) #c = blas.sgemm(1.0,a,b) te = time.time() duration = te - ts flops = 2.0 * (np.double(m) * np.double(n) * np.double(l)) - (np.double(m) * np.double(n)) gflops = (itermax * flops / duration) * 1.0e-9 print("c") print(c) print("a") print(a)
def check_answer(mat_A, mat_B, mat_C):
    answer = FB.sgemm(alpha=1.0, a=mat_A, b=mat_B)
    # rounded_answer = np.around(answer, decimals=5)
    # rounded_mat_C = np.around(mat_C, decimals=5)
    return np.allclose(answer, mat_C)
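# Hedged usage sketch for check_answer: verify a product computed elsewhere.
# FB is scipy.linalg.blas as in the surrounding scripts; since numpy's float32
# matmul typically dispatches to the same BLAS sgemm, the two results should
# agree to within allclose's default tolerances.
import numpy as np
from scipy.linalg import blas as FB

A = np.random.rand(64, 64).astype(np.float32)
B = np.random.rand(64, 64).astype(np.float32)
C = A @ B
assert check_answer(A, B, C)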
def test(model, queryloader, galleryloader, pool, use_gpu, ranks=[1, 5, 10, 20]):
    with torch.no_grad():
        model.eval()
        qf, q_pids, q_camids = [], [], []
        for batch_idx, (imgs, pids, camids) in tqdm(enumerate(queryloader),
                                                    total=len(queryloader)):
            if use_gpu:
                imgs = imgs.cuda()
            # imgs = Variable(imgs, volatile=True)
            # b=1, n=number of clips, s=16
            b, n, s, c, h, w = imgs.size()
            assert (b == 1)
            imgs = imgs.view(b * n, s, c, h, w)
            features = model(imgs)
            features = features.view(n, -1)
            features = torch.mean(features, 0)
            features = features.data.cpu().numpy()
            qf.append(features)
            q_pids.extend(pids)
            q_camids.extend(camids)
            if batch_idx % 20 == 0:
                gc.collect()
        qf = np.asarray(qf, dtype=np.float32)
        q_pids = np.asarray(q_pids)
        q_camids = np.asarray(q_camids)
        gc.collect()
        print("Extracted features for query set, obtained {}-by-{} matrix".format(
            qf.shape[0], qf.shape[1]))
        gf, g_pids, g_camids = [], [], []
        for batch_idx, (imgs, pids, camids) in tqdm(enumerate(galleryloader),
                                                    total=len(galleryloader)):
            if use_gpu:
                imgs = imgs.cuda()
            # imgs = Variable(imgs, volatile=True)
            b, n, s, c, h, w = imgs.size()
            imgs = imgs.view(b * n, s, c, h, w)
            assert (b == 1)
            features = model(imgs)
            features = features.view(n, -1)
            if pool == 'avg':
                features = torch.mean(features, 0)
            else:
                features, _ = torch.max(features, 0)
            features = features.data.cpu().numpy()
            gf.append(features)
            g_pids.extend(pids)
            g_camids.extend(camids)
            if batch_idx % 20 == 0:
                gc.collect()
        gf = np.asarray(gf, dtype=np.float32)
        g_pids = np.asarray(g_pids)
        g_camids = np.asarray(g_camids)
        gc.collect()
        print("Extracted features for gallery set, obtained {}-by-{} matrix".format(
            gf.shape[0], gf.shape[1]))
        print("Computing distance matrix")
        # squared Euclidean distances via ||q||^2 + ||g||^2 - 2 q.g,
        # with the cross term computed in BLAS
        m, n = qf.shape[0], gf.shape[0]
        distmat = np.tile(np.sum(np.power(qf, 2), axis=1, keepdims=True), (1, n)) + \
                  np.tile(np.sum(np.power(gf, 2), axis=1, keepdims=True), (1, m)).T
        distmat -= 2 * blas.sgemm(1, qf, gf.T)
        # distmat = np.power(qf, 2).sum(dim=1, keepdim=True).expand(m, n) + \
        #     torch.pow(gf, 2).sum(dim=1, keepdim=True).expand(n, m).t()
        # distmat.addmm_(1, -2, qf, gf.t())
        # distmat = distmat.numpy()
        print("Computing CMC and mAP")
        cmc, mAP = evaluate(distmat, q_pids, g_pids, q_camids, g_camids)
        print("Results ----------")
        print("mAP: {:.1%}".format(mAP))
        print("CMC curve")
        for r in ranks:
            print("Rank-{:<3}: {:.1%}".format(r, cmc[r - 1]))
        print("------------------")
    return cmc[0]
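# A small numeric check of the distance identity used in test() above:
# ||q - g||^2 == ||q||^2 + ||g||^2 - 2 q.g, with the cross term via sgemm.
# The feature matrices here are synthetic stand-ins for qf and gf.
import numpy as np
from scipy.linalg import blas

qf = np.random.rand(5, 7).astype(np.float32)
gf = np.random.rand(6, 7).astype(np.float32)
distmat = (np.sum(qf ** 2, axis=1, keepdims=True)
           + np.sum(gf ** 2, axis=1, keepdims=True).T
           - 2 * blas.sgemm(1, qf, gf.T))
ref = ((qf[:, None, :] - gf[None, :, :]) ** 2).sum(-1)
assert np.allclose(distmat, ref, atol=1e-4)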