def write_stra_matC(myfile, coeff_idx, coeffs, idx, dim_name, dims, level):
    """Emit the block-pointer and coefficient ``std::array`` declarations.

    Two C++ statements are written through ``write_line(myfile, 1, ...)``:

    * ``std::array<T*,N> <M><idx>_list = {...};`` holding one
      ``const_cast<T*>(block)`` entry per nonzero coefficient, where the
      block name comes from ``getBlockName(coeff_idx, position, dims, level)``;
    * ``std::array<T,N> <M><idx>_coeff_list = {...};`` holding the nonzero
      coefficients themselves.

    ``<M>`` is the first element of ``getName(coeff_idx)`` and ``N`` is the
    number of coefficients for which ``is_nonzero`` is true.  ``dim_name`` is
    accepted for signature parity with the sibling writers but is not used.
    """
    prefix = getName(coeff_idx)[0] + str(idx)
    # Keep (position, value) pairs so block names and coefficients stay aligned.
    kept = [(pos, val) for pos, val in enumerate(coeffs) if is_nonzero(val)]
    count = str(len(kept))

    pointers = [
        'const_cast<T*>(%s)' % getBlockName(coeff_idx, pos, dims, level)
        for pos, _ in kept
    ]
    write_line(
        myfile, 1,
        'std::array<T*,' + count + '> ' + prefix + '_list = {' +
        ', '.join(pointers) + '};')

    scalars = [str(val) for _, val in kept]
    write_line(
        myfile, 1,
        'std::array<T,' + count + '> ' + prefix + '_coeff_list = {' +
        ', '.join(scalars) + '};')
def write_neighbor_visualization_kml(known, k=6, verbose=False):
    """Write a KML file showing k nearest neighbors among known locations.

    A ``NearestNeighbors`` model is fitted on the ``XCOLS`` columns of
    ``known`` and queried with the same data.  For every row a ``<Folder>``
    named after the row's ``ptol_id`` is written to
    ``../Data/visualize_neighbors.kml``.  The folder holds the row's own
    ``XCOLS`` and ``YCOLS`` points in red, the neighbours' points in yellow,
    and one coloured line per neighbour in both coordinate sets.

    Args:
        known: DataFrame providing the ``XCOLS``, ``YCOLS`` and ``ptol_id``
            columns.
        k: ``n_neighbors`` passed to ``NearestNeighbors``; ``k - 1``
            neighbours are drawn per row.
        verbose: when true, print each row and its neighbours while writing.
    """
    X = known.loc[:, XCOLS]
    neighbors = NearestNeighbors(n_neighbors=k).fit(X)
    _, indices = neighbors.kneighbors(X)
    colors = 'red,orange,yellow,green,cyan,purple'.split(',')
    # NOTE(review): writing str to a 'wb' handle and DataFrame.ix both rely on
    # Python 2 / old pandas; they are left untouched because the `common`
    # helpers that share this handle are not visible here.
    with open('../Data/visualize_neighbors.kml', 'wb') as kml:
        kml.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        kml.write('<kml xmlns="http://www.opengis.net/kml/2.2">\n')
        kml.write('<Document>\n')
        common.write_styles(kml)
        for i in range(len(known)):
            alabel = known.ix[i].ptol_id
            kml.write(' <Folder id="%s">\n' % (alabel, ))
            kml.write(' <name>%s</name>\n' % (alabel, ))
            ax = known.ix[i, XCOLS].values
            ay = known.ix[i, YCOLS].values
            common.write_point(kml, 'red', ax, alabel)
            common.write_point(kml, 'red', ay, alabel)
            if verbose:
                # Single pre-formatted argument: same text under the Python 2
                # print statement and the Python 3 print function.
                print('%s %s %s' % (alabel, ax, ay))
            # Column 0 is skipped -- presumably each point comes back as its
            # own nearest neighbour because the model is queried with its
            # training data.
            points = [indices[i, j] for j in range(1, k)]
            for m, j in enumerate(points):
                bx = known.ix[j, XCOLS].values
                by = known.ix[j, YCOLS].values
                blabel = known.ix[j].ptol_id
                # Cycle through the palette so k > 7 no longer raises
                # IndexError; for k <= 7 the colours are unchanged.
                color = colors[m % len(colors)]
                common.write_point(kml, 'yellow', bx, blabel)
                common.write_point(kml, 'yellow', by, blabel)
                common.write_line(kml, ax, bx, color)
                common.write_line(kml, ay, by, color)
                if verbose:
                    print('%s %s %s %s' % (' ', blabel, bx, by))
            kml.write(' </Folder>\n')
        kml.write('</Document>\n')
        kml.write('</kml>\n')
def write_straprim_caller(myfile, index, a_coeffs, b_coeffs, c_coeffs, dims,
                          num_multiplies, level=1):
    """Emit the call to ``bl_dgemm_straprim_abc<index>`` for one product.

    Two lines are written through ``write_line(myfile, 1, ...)``:

    1. a ``//`` comment spelling out ``M<index>`` as the product of the two
       linear combinations of A and B blocks, followed by the ``+=`` updates
       applied to the C blocks;
    2. the call itself, passing the block names of every nonzero A, B and C
       coefficient followed by ``lda``/``ldb``/``ldc`` and the packing
       buffers.

    ``num_multiplies`` is accepted but not used.
    """

    def nonzero_blocks(operand, coeffs):
        # (coefficient, block name) for every coefficient that is_nonzero.
        return [(value, getBlockName(operand, pos, dims, level))
                for pos, value in enumerate(coeffs) if is_nonzero(value)]

    a_blocks = nonzero_blocks(0, a_coeffs)
    b_blocks = nonzero_blocks(1, b_coeffs)
    c_blocks = nonzero_blocks(2, c_coeffs)

    lhs = ' + '.join(['%s * %s' % (value, name) for value, name in a_blocks])
    rhs = ' + '.join(['%s * %s' % (value, name) for value, name in b_blocks])
    updates = '; '.join([' %s += %s * M%d' % (name, value, index)
                         for value, name in c_blocks])
    write_line(myfile, 1,
               '// M%d = (%s) * (%s); %s;' % (index, lhs, rhs, updates))

    call_parts = [
        'bl_dgemm_straprim_abc%d( ms, ns, ks, ' % index,
        ', '.join(['%s' % name for _, name in a_blocks]),
        ', lda, ',
        ', '.join(['%s' % name for _, name in b_blocks]),
        ', ldb, ',
        ', '.join(['%s' % name for _, name in c_blocks]),
        ', ldc, packA, packB, bl_ic_nt );',
    ]
    write_line(myfile, 1, ''.join(call_parts))
def create_micro_functions(myfile, coeffs, kernel_header_filename):
    """Write the micro-kernel source file.

    The file starts with an ``#include`` of ``kernel_header_filename``,
    followed by the shared rank-k macro assembly and the initialisation macro
    from ``abc_micro_kernel_gen``.  Then, for each non-empty entry of
    ``transpose(coeffs[2])``, one micro kernel is generated from its nonzero
    coefficients, provided there are at most 23 of them.
    """
    write_line(myfile, 0, '#include "%s"' % kernel_header_filename)
    write_break(myfile)
    abc_micro_kernel_gen.write_common_rankk_macro_assembly(myfile)
    write_break(myfile)
    abc_micro_kernel_gen.macro_initialize_assembly(myfile)
    write_break(myfile)

    for column, column_coeffs in enumerate(transpose(coeffs[2])):
        if len(column_coeffs) == 0:
            continue
        kept = [value for value in column_coeffs if is_nonzero(value)]
        if len(kept) <= 23:
            abc_micro_kernel_gen.generate_micro_kernel(myfile, kept, column)
        # NOTE(review): the flattened source does not show whether this break
        # sits inside the `<= 23` branch or after it -- confirm against the
        # original file.
        write_break(myfile)
def write_updatec_two_assembly( myfile ):
    """Emit the inline-assembly C update for a fixed count of two C blocks.

    First one ``movq`` is written that loads asm operand ``%<nnz+6>`` (labelled
    "address of alpha_list" in the emitted comment) into ``rax``.  Then, for
    each of the two blocks, a template is written that broadcasts the current
    alpha value, loads the block's address from operand ``%<j+6>`` and, for
    four columns, adds alpha times the accumulators ``ymm8``-``ymm15`` into
    the two 32-byte halves of that column, advancing by ``rdi`` between
    columns.  ``rax`` is advanced by 8 bytes after each block.

    Register names come from ``get_reg()`` / ``get_avx_reg()``, which are
    defined elsewhere; presumably ``get_avx_reg(x)`` returns a ymm register
    different from ``x`` -- TODO confirm.
    """
    #nnz = len( nonzero_coeffs )
    nnz = 2
    write_line( myfile, 1, '"movq %{0}, %%rax \\n\\t" // load address of alpha_list'.format(nnz+6) )
    for j in range( nnz ):
    #for j, coeff in enumerate(nonzero_coeffs):
        #print "coeff not 1 / -1!"
        alpha_avx_reg = get_avx_reg()
        # NOTE(review): the line breaks and indentation inside this template
        # were reconstructed from a flattened source.  The trailing `//` text
        # is emitted verbatim into the generated C file; several of those
        # emitted comments name stale placeholder numbers.
        # Placeholders: {0} operand index of the C pointer, {1} block number,
        # {2} general-purpose register, {3} alpha register, {4}-{19} scratch
        # registers (one fresh get_avx_reg call each).
        myfile.write( \
'''\
    " \\n\\t"
    "vbroadcastsd (%%rax), %%{3} \\n\\t" // load alpha_list[ i ] and duplicate
    "movq %{0}, %%{2} \\n\\t" // load address of c
    " \\n\\t"
    "vmovapd 0 * 32(%%{2}), %%{4} \\n\\t" // {4} = c{1}( 0:3, 0 )
    "vmulpd %%{3}, %%ymm9, %%{5} \\n\\t" // scale by alpha, {5} = {3}( alpha ) * ymm9( c{1}( 0:3, 0 ) )
    "vaddpd %%{4}, %%{5}, %%{4} \\n\\t" // {4} += {5}
    "vmovapd %%{4}, 0(%%{2}) \\n\\t" // c{1}( 0:3, 0 ) = {4}
    "vmovapd 1 * 32(%%{2}), %%{6} \\n\\t" // {6} = c{1}( 4:7, 0 )
    "vmulpd %%{3}, %%ymm8, %%{7} \\n\\t" // scale by alpha, {7} = {3}( alpha ) * ymm8( c{1}( 4:7, 0 ) )
    "vaddpd %%{6}, %%{7}, %%{6} \\n\\t" // {6} += {7}
    "vmovapd %%{6}, 32(%%{2}) \\n\\t" // c{1}( 4:7, 0 ) = {6}
    "addq %%rdi, %%{2} \\n\\t"
    "vmovapd 0 * 32(%%{2}), %%{8} \\n\\t" // {8} = c{1}( 0:3, 1 )
    "vmulpd %%{3}, %%ymm11, %%{9} \\n\\t" // scale by alpha, {5} = {3}( alpha ) * ymm11( c{1}( 0:3, 1 ) )
    "vaddpd %%{8}, %%{9}, %%{8} \\n\\t" // {8} += {7}
    "vmovapd %%{8}, 0(%%{2}) \\n\\t" // c{1}( 0:3, 1 ) = {8}
    "vmovapd 1 * 32(%%{2}), %%{10} \\n\\t" // {10} = c{1}( 4:7, 1 )
    "vmulpd %%{3}, %%ymm10, %%{11} \\n\\t" // scale by alpha, {5} = {3}( alpha ) * ymm10( c{1}( 4:7, 1 ) )
    "vaddpd %%{10}, %%{11}, %%{10} \\n\\t" // {10} += {9}
    "vmovapd %%{10}, 32(%%{2}) \\n\\t" // c{1}( 4:7, 1 ) = {10}
    "addq %%rdi, %%{2} \\n\\t"
    "vmovapd 0 * 32(%%{2}), %%{12} \\n\\t" // {12} = c{1}( 0:3, 2 )
    "vmulpd %%{3}, %%ymm13, %%{13} \\n\\t" // scale by alpha, {5} = {3}( alpha ) * ymm13( c{1}( 0:3, 2 ) )
    "vaddpd %%{12}, %%{13}, %%{12} \\n\\t" // {12} += {11}
    "vmovapd %%{12}, 0(%%{2}) \\n\\t" // c{1}( 0:3, 2 ) = {12}
    "vmovapd 1 * 32(%%{2}), %%{14} \\n\\t" // {14} = c{1}( 4:7, 2 )
    "vmulpd %%{3}, %%ymm12, %%{15} \\n\\t" // scale by alpha, {5} = {3}( alpha ) * ymm12( c{1}( 4:7, 2 ) )
    "vaddpd %%{14}, %%{15}, %%{14} \\n\\t" // {14} += {13}
    "vmovapd %%{14}, 32(%%{2}) \\n\\t" // c{1}( 4:7, 2 ) = {14}
    "addq %%rdi, %%{2} \\n\\t"
    "vmovapd 0 * 32(%%{2}), %%{16} \\n\\t" // {16} = c{1}( 0:3, 3 )
    "vmulpd %%{3}, %%ymm15, %%{17} \\n\\t" // scale by alpha, {5} = {3}( alpha ) * ymm15( c{1}( 0:3, 3 ) )
    "vaddpd %%{16}, %%{17}, %%{16} \\n\\t" // {16} += {15}
    "vmovapd %%{16}, 0(%%{2}) \\n\\t" // c{1}( 0:3, 3 ) = {16}
    "vmovapd 1 * 32(%%{2}), %%{18} \\n\\t" // {18} = c{1}( 4:7, 3 )
    "vmulpd %%{3}, %%ymm14, %%{19} \\n\\t" // scale by alpha, {5} = {3}( alpha ) * ymm14( c{1}( 4:7, 3 ) )
    "vaddpd %%{18}, %%{19}, %%{18} \\n\\t" // {18} +={17}
    "vmovapd %%{18}, 32(%%{2}) \\n\\t" // c{1}( 4:7, 3 ) = {18}
    "addq $1 * 8, %%rax \\n\\t" // alpha_list += 8
    " \\n\\t"
'''.format( str(j+6), str(j), get_reg(), alpha_avx_reg, get_avx_reg( alpha_avx_reg ), get_avx_reg( alpha_avx_reg ), get_avx_reg( alpha_avx_reg ), get_avx_reg( alpha_avx_reg ), get_avx_reg( alpha_avx_reg ), get_avx_reg( alpha_avx_reg ), get_avx_reg( alpha_avx_reg ), get_avx_reg( alpha_avx_reg ), get_avx_reg( alpha_avx_reg ), get_avx_reg( alpha_avx_reg ), get_avx_reg( alpha_avx_reg ), get_avx_reg( alpha_avx_reg ), get_avx_reg( alpha_avx_reg ), get_avx_reg( alpha_avx_reg ), get_avx_reg( alpha_avx_reg ), get_avx_reg( alpha_avx_reg ) ) )
def write_stra_mat(myfile, coeff_idx, coeffs, idx, dim_name, dims, level):
    """Emit one ``stra_matrix_view<T,N>`` declaration.

    The generated C++ statement declares ``<M>v<idx>`` with four brace
    lists: the dimensions (``dim_name``, inserted verbatim), the
    ``const_cast<T*>`` block pointers of every nonzero coefficient, the
    nonzero coefficients themselves, and ``{rs_<M>, cs_<M>}``.  ``<M>`` is the
    first element of ``getName(coeff_idx)`` and ``N`` the number of
    coefficients for which ``is_nonzero`` is true.
    """
    letter = getName(coeff_idx)[0]
    kept = [(pos, value) for pos, value in enumerate(coeffs)
            if is_nonzero(value)]

    head = ('stra_matrix_view<T,' + str(len(kept)) + '> ' + letter + 'v' +
            str(idx) + '({' + dim_name + '}, {')
    pointers = ', '.join([
        'const_cast<T*>(%s)' % getBlockName(coeff_idx, pos, dims, level)
        for pos, _ in kept
    ])
    scalars = ', '.join([str(value) for _, value in kept])
    tail = '}, {rs_' + letter + ', cs_' + letter + '});'

    write_line(myfile, 1, head + pointers + '}, {' + scalars + tail)
def write_straprim_caller(myfile, index, a_coeffs, b_coeffs, c_coeffs, dims, num_multiplies, level=1):
    """Emit the ``stra_gemm`` invocation for product ``M<index>``.

    Output, in order: a ``//`` comment describing the product and the C
    updates; the three view declarations produced by ``write_stra_mat`` for
    A, B and C; a C++ ``if``/``else`` that, when
    ``Cv<index>.stride(!row_major) == 1``, transposes all three views and
    calls ``stra_gemm`` with the B and A views swapped, and otherwise calls
    it directly; a ``comm.barrier();``; and two commented-out debug lines.

    ``num_multiplies`` is accepted but not used.
    """
    # Human-readable description of the product, emitted as a C++ comment.
    comment = '// M%d = (' % (index)
    comment += ' + '.join([str(c) + ' * %s' % getBlockName( 0, i, dims, level ) \
                           for i, c in enumerate(a_coeffs) if is_nonzero(c)])
    comment += ') * ('
    comment += ' + '.join([str(c) + ' * %s' % getBlockName( 1, i, dims, level ) \
                           for i, c in enumerate(b_coeffs) if is_nonzero(c)])
    comment += '); '
    comment += '; '.join([
        ' %s += %s * M%d' % (getBlockName(2, i, dims, level), c, index)
        for i, c in enumerate(c_coeffs) if is_nonzero(c)
    ])
    comment += ';'
    write_line(myfile, 1, comment)
    # The dimension labels are passed as two-element lists here, so the
    # write_stra_mat in scope must accept that form.
    write_stra_mat(myfile, 0, a_coeffs, index, ['AC', 'AB'], dims, level)
    write_stra_mat(myfile, 1, b_coeffs, index, ['AB', 'BC'], dims, level)
    write_stra_mat(myfile, 2, c_coeffs, index, ['AC', 'BC'], dims, level)
    # NOTE(review): line breaks/indentation of this template were
    # reconstructed from a flattened source.  `{{` / `}}` are literal braces
    # escaped for str.format.
    myfile.write( \
'''\
    if (Cv{0}.stride(!row_major) == 1)
    {{
        Av{0}.transpose();
        Bv{0}.transpose();
        Cv{0}.transpose();
        stra_gemm(comm, cfg, alpha, Bv{0}, Av{0}, beta, Cv{0});
    }} else {{
        stra_gemm(comm, cfg, alpha, Av{0}, Bv{0}, beta, Cv{0});
    }}
'''.format( index ) )
    #Av{0}.swap(Bv{0});
    #add = 'stra_gemm(comm, cfg, alpha, Av{0}, Bv{0}, beta, Cv{0});'.format( index )
    #write_line( myfile, 1, add )
    write_line(myfile, 1, 'comm.barrier();')
    write_line(
        myfile, 1,
        '//std::cout << "stra_internal/stra_mult_M{0}:" << std::endl;'.format(
            index))
    write_line(myfile, 1, '//print_tensor_matrix( ct );')
    write_break(myfile)
def write_common_start_assembly(myfile, nnz):
    """Emit the prologue of a generated micro kernel.

    Written in order: C declarations of ``b_next``, ``k_iter`` and
    ``k_left``; one ``double`` declaration binding ``coeff<i>`` to
    ``&coeff_list[i]`` and one binding ``c<i>`` to ``c_list[i]`` for
    ``i < nnz``; and the opening of the ``__asm__ volatile`` block, which
    loads ``a``, ``b`` and ``b_next``, preloads the first vectors of a and b,
    and scales ``cs_c`` by 8 into ``rdi``.
    """
    # NOTE(review): line breaks/indentation inside both templates were
    # reconstructed from a flattened source.  Neither template goes through
    # str.format, so `%%` and `%[name]` reach the C file unchanged.
    myfile.write( \
'''\
    void* b_next = bli_auxinfo_next_b( data );
    uint64_t k_iter = k / 4;
    uint64_t k_left = k % 4;
''' )
    # double *coeff0 = &coeff_list[0], *coeff1 = &coeff_list[1], ...;
    add = 'double '
    add += ', '.join(
        ['*coeff%d = &coeff_list[%d]' % (i, i) for i in range(nnz)])
    add += ';'
    write_line(myfile, 1, add)
    # double *c0 = c_list[0], *c1 = c_list[1], ...;
    add = 'double '
    add += ', '.join(['*c%d = c_list[%d]' % (i, i) for i in range(nnz)])
    add += ';'
    write_line(myfile, 1, add)
    write_break(myfile)
    myfile.write( \
'''\
    __asm__ volatile
    (
    " \\n\\t"
    " \\n\\t"
    "movq %[a], %%rax \\n\\t" // load address of a. ( v )
    "movq %[b], %%rbx \\n\\t" // load address of b. ( v )
    "movq %[b_next], %%r15 \\n\\t" // load address of b_next. ( v )
    "addq $-4 * 64, %%r15 \\n\\t" // ( ? )
    " \\n\\t"
    "vmovapd 0 * 32(%%rax), %%ymm0 \\n\\t" // initialize loop by pre-loading
    "vmovapd 0 * 32(%%rbx), %%ymm2 \\n\\t" // elements of a and b.
    "vpermilpd $0x5, %%ymm2, %%ymm3 \\n\\t"
    " \\n\\t"
    " \\n\\t"
    "movq %[cs_c], %%rdi \\n\\t" // load cs_c
    "leaq (,%%rdi,8), %%rdi \\n\\t" // cs_c * sizeof(double)
''' )
def write_straprim_caller(myfile, index, a_coeffs, b_coeffs, c_coeffs, dims,
                          num_multiplies, level=1):
    """Emit the ``straprim_ab`` invocation for product ``M<index>``.

    Output, in order: a ``//`` comment describing the product and the C
    updates; the A, B and C view declarations produced by ``write_stra_mat``
    with the dimension strings ``'ms, ks'``, ``'ks, ns'`` and ``'ms, ns'``;
    the ``straprim_ab(...)`` call on those views; ``comm.barrier();``; and a
    break.  ``num_multiplies`` is accepted but not used.
    """

    def describe(operand, coeffs, template):
        # One formatted term per coefficient that is_nonzero.
        return [
            template % {'coeff': value,
                        'block': getBlockName(operand, pos, dims, level),
                        'index': index}
            for pos, value in enumerate(coeffs) if is_nonzero(value)
        ]

    a_terms = describe(0, a_coeffs, '%(coeff)s * %(block)s')
    b_terms = describe(1, b_coeffs, '%(coeff)s * %(block)s')
    c_terms = describe(2, c_coeffs, ' %(block)s += %(coeff)s * M%(index)d')
    write_line(
        myfile, 1, '// M%d = (%s) * (%s); %s;' %
        (index, ' + '.join(a_terms), ' + '.join(b_terms), '; '.join(c_terms)))

    operands = ((a_coeffs, 'ms, ks'), (b_coeffs, 'ks, ns'),
                (c_coeffs, 'ms, ns'))
    for operand, (coeffs, shape) in enumerate(operands):
        write_stra_mat(myfile, operand, coeffs, index, shape, dims, level)

    call = 'straprim_ab(comm, cfg, alpha, Av{0}, Bv{0}, beta, Cv{0});'
    write_line(myfile, 1, call.format(index))
    write_line(myfile, 1, 'comm.barrier();')
    write_break(myfile)
def write_stra_matAB(myfile, coeff_idx, coeffs, idx, dim_name, dims, level):
    """Emit the sub-block ids, coefficients and tensor view of one operand.

    Three C++ statements are written through ``write_line(myfile, 1, ...)``:

    * ``std::array<unsigned, N> <M><idx>_subid`` with the value of
      ``getActualBlockIndex`` for every nonzero coefficient;
    * ``std::array<T,N> <M><idx>_coeff_list`` with the nonzero coefficients;
    * ``stra_tensor_view<T,N> <M>v<idx>(...)`` built from the
      ``my_len_*`` / ``my_stride_<M>_*`` names of the two entries of
      ``dim_name``, ``<M>_divisor``, the base pointer ``<M>`` and the two
      arrays above.

    ``<M>`` is the first element of ``getName(coeff_idx)``.
    """
    letter = getName(coeff_idx)[0]
    tag = letter + str(idx)
    kept = [(pos, value) for pos, value in enumerate(coeffs)
            if is_nonzero(value)]
    count = str(len(kept))

    sub_ids = ', '.join([
        '%s' % getActualBlockIndex(coeff_idx, pos, dims, level)
        for pos, _ in kept
    ])
    write_line(
        myfile, 1,
        'std::array<unsigned, ' + count + '> ' + tag + '_subid = {' +
        sub_ids + '};')

    scalars = ', '.join([str(value) for _, value in kept])
    write_line(
        myfile, 1,
        'std::array<T,' + count + '> ' + tag + '_coeff_list = {' + scalars +
        '};')

    view_parts = [
        'stra_tensor_view<T,', count, '> ', letter, 'v', str(idx),
        '(my_len_', dim_name[0], ', ',
        'my_len_', dim_name[1], ', ',
        letter, '_divisor, const_cast<T*>(', letter, '), ',
        tag, '_subid, ', tag, '_coeff_list, ',
        'my_stride_', letter, '_', dim_name[0], ',',
        ' my_stride_', letter, '_', dim_name[1],
        ');',
    ]
    write_line(myfile, 1, ''.join(view_parts))
def write_straprim_caller(myfile, index, a_coeffs, b_coeffs, c_coeffs, dims, num_multiplies, level=1):
    """Emit the ``straprim_naive`` invocation for product ``M<index>``.

    Output, in order: a ``//`` comment describing the product and the C
    updates; the three ``write_stra_mat`` declarations for A, B and C; a
    call to ``straprim_naive<T,na,nb,nc>`` (preceded by a commented-out
    transposed variant) whose template arguments are ``getNNZ`` of the three
    coefficient lists; ``comm.barrier();``; and two commented-out debug
    lines.

    The emitted call references ``A<index>_list`` / ``A<index>_coeff_list``
    (and the B/C equivalents), which are not produced by anything visible in
    this block -- presumably ``write_stra_mat`` declares them; TODO confirm.
    ``num_multiplies`` is accepted but not used.
    """
    comment = '// M%d = (' % (index)
    comment += ' + '.join([str(c) + ' * %s' % getBlockName( 0, i, dims, level ) \
                           for i, c in enumerate(a_coeffs) if is_nonzero(c)])
    comment += ') * ('
    comment += ' + '.join([str(c) + ' * %s' % getBlockName( 1, i, dims, level ) \
                           for i, c in enumerate(b_coeffs) if is_nonzero(c)])
    comment += '); '
    comment += '; '.join([' %s += %s * M%d' % ( getBlockName( 2, i, dims, level ), c, index ) for i, c in enumerate(c_coeffs) if is_nonzero(c)])
    comment += ';'
    write_line(myfile, 1, comment)
    write_stra_mat( myfile, 0, a_coeffs, index, ['AC', 'AB'], dims, level )
    write_stra_mat( myfile, 1, b_coeffs, index, ['AB', 'BC'], dims, level )
    write_stra_mat( myfile, 2, c_coeffs, index, ['AC', 'BC'], dims, level )
    # NOTE(review): line breaks/indentation of this template were
    # reconstructed from a flattened source; every `//` fragment was placed
    # at the start of its own line so that only the second straprim_naive
    # call is live C++.  {0} = index, {1}/{2}/{3} = nonzero counts of A/B/C.
    myfile.write( \
'''\
    //if (ct.stride(!row_major) == 1)
    //{{
    // Av{0}.transpose();
    // Bv{0}.transpose();
    // Cv{0}.transpose();
    // straprim_naive<T,{1},{2},{3}>(comm, cfg, my_sub_len_AB, my_sub_len_AC, my_sub_len_BC,
    // alpha,
    // B{0}_list, B{0}_coeff_list, my_stride_B_AB, my_stride_B_BC,
    // A{0}_list, A{0}_coeff_list, my_stride_A_AB, my_stride_A_AC,
    // beta,
    // C{0}_list, C{0}_coeff_list, my_stride_C_AC, my_stride_C_BC);
    //}} else {{
    straprim_naive<T,{1},{2},{3}>(comm, cfg, my_sub_len_AB, my_sub_len_AC, my_sub_len_BC,
    alpha,
    A{0}_list, A{0}_coeff_list, my_stride_A_AB, my_stride_A_AC,
    B{0}_list, B{0}_coeff_list, my_stride_B_AB, my_stride_B_BC,
    beta,
    C{0}_list, C{0}_coeff_list, my_stride_C_AC, my_stride_C_BC);
    //}}
'''.format( index, getNNZ(a_coeffs), getNNZ(b_coeffs), getNNZ(c_coeffs) ) )
    #Av{0}.swap(Bv{0});
    #add = 'stra_gemm(comm, cfg, alpha, Av{0}, Bv{0}, beta, Cv{0});'.format( index )
    #write_line( myfile, 1, add )
    write_line( myfile, 1, 'comm.barrier();' )
    write_line( myfile, 1, '//std::cout << "stra_internal/stra_mult_M{0}:" << std::endl;'.format( index ) )
    write_line( myfile, 1, '//print_tensor_matrix( ct );' )
    write_break( myfile )
def gen_micro_kernel(outfile, nnz):
    """Generate one micro-kernel source file.

    The file at ``outfile`` is (re)created and filled, in order, with the
    function name derived from ``getNumberName(nnz)``, the common start
    assembly, the prefetch section, the rank-k update, the C update, the
    common end assembly and the closing ``}``.

    Args:
        outfile: path of the file to write.
        nnz: number of C blocks/coefficients the kernel handles; forwarded
            to the section writers.
    """
    # A context manager closes (and flushes) the file even if one of the
    # section writers raises; the original left the handle open.
    with open(outfile, 'w') as myfile:
        write_function_name(myfile, getNumberName(nnz))
        write_common_start_assembly(myfile, nnz)
        write_prefetch_assembly(myfile, nnz)
        write_common_rankk_assembly(myfile)
        write_updatec_assembly(myfile, nnz)
        write_common_end_assembly(myfile, nnz)
        write_line(myfile, 0, '}')
def write_neighbor_visualization_kml(known, k=6, verbose=False):
    """Write a KML file showing k nearest neighbors among known locations.

    A ``NearestNeighbors`` model is fitted on the ``XCOLS`` columns of
    ``known`` and queried with the same data.  For every row a ``<Folder>``
    named after the row's ``ptol_id`` is written to
    ``../Data/visualize_neighbors.kml``.  The folder holds the row's own
    ``XCOLS`` and ``YCOLS`` points in red, the neighbours' points in yellow,
    and one coloured line per neighbour in both coordinate sets.

    Args:
        known: DataFrame providing the ``XCOLS``, ``YCOLS`` and ``ptol_id``
            columns.
        k: ``n_neighbors`` passed to ``NearestNeighbors``; ``k - 1``
            neighbours are drawn per row.
        verbose: when true, print each row and its neighbours while writing.
    """
    X = known.loc[:, XCOLS]
    neighbors = NearestNeighbors(n_neighbors=k).fit(X)
    _, indices = neighbors.kneighbors(X)
    colors = "red,orange,yellow,green,cyan,purple".split(",")
    # NOTE(review): writing str to a "wb" handle and DataFrame.ix both rely on
    # Python 2 / old pandas; they are left untouched because the `common`
    # helpers that share this handle are not visible here.
    with open("../Data/visualize_neighbors.kml", "wb") as kml:
        kml.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        kml.write('<kml xmlns="http://www.opengis.net/kml/2.2">\n')
        kml.write("<Document>\n")
        common.write_styles(kml)
        for i in range(len(known)):
            alabel = known.ix[i].ptol_id
            kml.write(' <Folder id="%s">\n' % (alabel,))
            kml.write(" <name>%s</name>\n" % (alabel,))
            ax = known.ix[i, XCOLS].values
            ay = known.ix[i, YCOLS].values
            common.write_point(kml, "red", ax, alabel)
            common.write_point(kml, "red", ay, alabel)
            if verbose:
                # Single pre-formatted argument: same text under the Python 2
                # print statement and the Python 3 print function.
                print("%s %s %s" % (alabel, ax, ay))
            # Column 0 is skipped -- presumably each point comes back as its
            # own nearest neighbour because the model is queried with its
            # training data.
            points = [indices[i, j] for j in range(1, k)]
            for m, j in enumerate(points):
                bx = known.ix[j, XCOLS].values
                by = known.ix[j, YCOLS].values
                blabel = known.ix[j].ptol_id
                # Cycle through the palette so k > 7 no longer raises
                # IndexError; for k <= 7 the colours are unchanged.
                color = colors[m % len(colors)]
                common.write_point(kml, "yellow", bx, blabel)
                common.write_point(kml, "yellow", by, blabel)
                common.write_line(kml, ax, bx, color)
                common.write_line(kml, ay, by, color)
                if verbose:
                    print("%s %s %s %s" % (" ", blabel, bx, by))
            kml.write(" </Folder>\n")
        kml.write("</Document>\n")
        kml.write("</kml>\n")
def write_divisor_initializer(myfile, dims, level):
    """Emit the ``A_divisor`` / ``B_divisor`` / ``C_divisor`` constants.

    ``exp_dim(dims, level)`` yields the three partition factors
    ``(d0, d1, d2)``.  Each operand's divisor pairs the factors of the two
    dimensions it spans:

    * A -> ``{d0, d1}``
    * B -> ``{d1, d2}``
    * C -> ``{d0, d2}``

    The previous version emitted ``{d1, d2}`` for C as well (a copy of the B
    line), which is only correct when ``d0 == d1``.
    """
    level_dim = exp_dim(dims, level)
    divisors = (
        ('A', level_dim[0], level_dim[1]),
        ('B', level_dim[1], level_dim[2]),
        # C is indexed by A's first and B's second dimension.
        ('C', level_dim[0], level_dim[2]),
    )
    for name, first, second in divisors:
        write_line(
            myfile, 1, 'const std::array<unsigned,2> %s_divisor={%d,%d};' %
            (name, first, second))
    write_break(myfile)
def write_updatec_colstored_assembly(myfile, nnz):
    """Emit the ``.DCOLSTORED`` section of the generated inline assembly.

    After the label, for each ``j < nnz`` the emitted code loads the address
    in asm operand ``[coeff<j>]``, broadcasts the value it points to into a
    ymm register, and then for four columns adds that coefficient times the
    accumulator pair (``ymm9/ymm8``, ``ymm11/ymm10``, ``ymm13/ymm12``,
    ``ymm15/ymm14``) into the two 32-byte halves addressed through
    ``get_reg.c2reg[j]``, advancing that register by ``rdi`` between columns.

    ``get_reg`` / ``get_avx_reg`` and the ``get_reg.c2reg`` mapping are
    defined elsewhere; presumably ``c2reg[j]`` names the register holding
    the address of C block ``j`` -- TODO confirm.
    """
    write_line(myfile, 1, '".DCOLSTORED: \\n\\t"')
    write_line(myfile, 1, '" \\n\\t"')
    for j in range(nnz):
        coeff_avx_reg = get_avx_reg()
        # NOTE(review): line breaks/indentation inside the templates below
        # were reconstructed from a flattened source.
        # {0} = j, {1} = scratch general-purpose register, {2} = coefficient
        # ymm register.
        myfile.write( \
'''\
    " \\n\\t"
    "movq %[coeff{0}], %%{1} \\n\\t" // load address of coeff{0}
    " \\n\\t"
    "vbroadcastsd (%%{1}), %%{2} \\n\\t" // load coeff{0} and duplicate
    " \\n\\t"
'''.format( j, get_reg(), coeff_avx_reg ) )
        #"leaq (%%rcx,%%rsi,4), %%r10 \\n\\t" // load address of c{0} + 4*rs_c;'
        c03_ymm_list = ['ymm9', 'ymm11', 'ymm13', 'ymm15'] #c00:c33
        c47_ymm_list = ['ymm8', 'ymm10', 'ymm12', 'ymm14'] #c40:c73
        for idx in range(4):
            # {0} = j, {1}/{2} = accumulators for rows 0:3 / 4:7 of column
            # idx, {3} = C address register, {4} = coefficient register,
            # {5}-{8} = scratch ymm registers.  The emitted `//` text always
            # says column 0, whatever idx is.
            myfile.write( \
'''\
    "vmovapd 0 * 32(%%{3}), %%{5} \\n\\t" // {5} = c{0}( 0:3, 0 )
    "vmulpd %%{4}, %%{1}, %%{6} \\n\\t" // scale by coeff{0}, {6} = {4}( coeff{0} ) * {1}( c{0}( 0:3, 0 ) )
    "vaddpd %%{5}, %%{6}, %%{5} \\n\\t" // {5} += {6}
    "vmovapd %%{5}, 0(%%{3}) \\n\\t" // c{0}( 0:3, 0 ) = {5}
    "vmovapd 1 * 32(%%{3}), %%{7} \\n\\t" // {7} = c{0}( 4:7, 0 )
    "vmulpd %%{4}, %%{2}, %%{8} \\n\\t" // scale by coeff{0}, {8} = {4}( coeff{0} ) * {2}( c{0}( 4:7, 0 ) )
    "vaddpd %%{7}, %%{8}, %%{7} \\n\\t" // {7} += {8}
    "vmovapd %%{7}, 32(%%{3}) \\n\\t" // c{0}( 4:7, 0 ) = {7}
'''.format(j, c03_ymm_list[idx], c47_ymm_list[idx], get_reg.c2reg[j], coeff_avx_reg, get_avx_reg(coeff_avx_reg), get_avx_reg(coeff_avx_reg), get_avx_reg(coeff_avx_reg), get_avx_reg(coeff_avx_reg) ) )
            # Step to the next column, except after the last one.
            if (idx != 3):
                write_line(
                    myfile, 1, '"addq %%rdi, %%{0} \\n\\t"'.
                    format(get_reg.c2reg[j]))
def write_triangle(kml, name, colors, points):
    """Draw the three edges of a triangle into ``kml``.

    Edge ``i`` runs from ``points[i]`` to ``points[(i + 1) % 3]`` and is
    drawn with ``colors[i]`` via ``common.write_line``.  ``name`` is accepted
    but not used.
    """
    for start, end in ((0, 1), (1, 2), (2, 0)):
        common.write_line(kml, points[start], points[end], colors[start])
def write_common_end_assembly(myfile, nnz):
    """Emit the tail of the ``__asm__`` block: label, operands, clobbers.

    Written in order: the ``.DDONE`` label, the (empty) output operand
    section, the input operand section and the clobber list, followed by
    ``);``.

    The input operands are the seven fixed ones (``k_iter`` .. ``cs_c``,
    indices 0-6), then ``[c<i>]`` for ``i < nnz`` (indices 7 ..) and
    ``[coeff<i>]`` for ``i < nnz`` (indices 7 + nnz ..).  GCC extended asm
    requires the operands to be comma separated, so every generated operand
    except the very last carries a trailing comma; the previous version
    emitted none, producing C that cannot be parsed.
    """
    write_line(myfile, 1, '" \\n\\t"')
    write_line(myfile, 1, '".DDONE: \\n\\t"')
    write_line(myfile, 1, '" \\n\\t"')
    write_line(myfile, 1, ': // output operands (none)')
    write_line(myfile, 1, ': // input operands')
    write_line(myfile, 1, ' [k_iter] "m" (k_iter), // 0')
    write_line(myfile, 1, ' [k_left] "m" (k_left), // 1')
    write_line(myfile, 1, ' [a] "m" (a), // 2')
    write_line(myfile, 1, ' [b] "m" (b), // 3')
    write_line(myfile, 1, ' [b_next] "m" (b_next), // 4')
    write_line(myfile, 1, ' [rs_c] "m" (rs_c), // 5')
    write_line(myfile, 1, ' [cs_c] "m" (cs_c), // 6')
    # The c operands are always followed by the coeff operands, so each one
    # needs a separator.
    c_operands = [
        ' [c%d] "m" (c%d), // %d' % (i, i, i + 7) for i in range(nnz)
    ]
    # Only the final coeff operand ends the list and must stay comma-free.
    coeff_operands = [
        ' [coeff%d] "m" (coeff%d)%s // %d' %
        (i, i, ',' if i < nnz - 1 else '', i + 7 + nnz) for i in range(nnz)
    ]
    add = '\n '.join(c_operands)
    add += '\n '
    add += '\n '.join(coeff_operands)
    write_line(myfile, 1, add)
    write_line(myfile, 1, ': // register clobber list')
    write_line(myfile, 1, ' "rax", "rbx", "rcx", "rdx", "rsi", "rdi",')
    write_line(myfile, 1,
               ' "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",')
    write_line(myfile, 1, ' "xmm0", "xmm1", "xmm2", "xmm3",')
    write_line(myfile, 1, ' "xmm4", "xmm5", "xmm6", "xmm7",')
    write_line(myfile, 1, ' "xmm8", "xmm9", "xmm10", "xmm11",')
    write_line(myfile, 1, ' "xmm12", "xmm13", "xmm14", "xmm15",')
    write_line(myfile, 1, ' "memory"')
    write_line(myfile, 1, ');')
def write_updatec_genstored_assembly(myfile, nnz):
    """Emit the ``.DGENSTORED`` section of the generated inline assembly.

    After the label, ``r12`` and ``r13`` are set to 2x and 3x ``rsi``
    (described as ``rs_c`` in the emitted comments).  For each ``j < nnz``
    the emitted code broadcasts the value behind asm operand ``[coeff<j>]``,
    computes a second base address ``c2reg[j] + 4*rsi``, and then makes two
    passes of four columns each: the first scales the low/high 128-bit halves
    of ``ymm9/11/13/15`` and adds them into the elements addressed from
    ``get_reg.c2reg[j]``; the second does the same with ``ymm8/10/12/14`` and
    the second base address.  Elements are loaded and stored individually
    with ``vmovlpd``/``vmovhpd`` at offsets 0, ``rsi``, ``r12`` and ``r13``.
    The section ends with ``jmp .DDONE``.

    ``get_reg`` / ``get_avx_reg`` and ``get_reg.c2reg`` are defined
    elsewhere; presumably ``c2reg[j]`` names the register holding the
    address of C block ``j`` -- TODO confirm.
    """
    write_line(myfile, 1, '".DGENSTORED: \\n\\t"')
    write_line(myfile, 1, '" \\n\\t"')
    write_line(
        myfile, 1,
        '"leaq (,%%rsi,2), %%r12 \\n\\t" // r12 = 2*rs_c;'
    )
    write_line(
        myfile, 1,
        '"leaq (%%r12,%%rsi,1), %%r13 \\n\\t" // r13 = 3*rs_c;'
    )
    write_line(myfile, 1, '" \\n\\t"')
    for j in range(nnz):
        c47_reg = get_reg()
        coeff_avx_reg = get_avx_reg()
        #coeff_avx_reg = 'ymm6'
        # NOTE(review): line breaks/indentation inside the templates below
        # were reconstructed from a flattened source.
        # {0} = j, {1} = C address register, {2} = register receiving the
        # second base address, {3} = scratch register for the coefficient
        # address, {4} = coefficient ymm register.
        myfile.write( \
'''\
    "movq %[coeff{0}], %%{3} \\n\\t" // load address of coeff{0}
    "vbroadcastsd (%%{3}), %%{4} \\n\\t" // load coeff{0} and duplicate
    "leaq (%%{1},%%rsi,4), %%{2} \\n\\t" // load address of c{0} + 4*rs_c;'
    " \\n\\t"
'''.format( j, get_reg.c2reg[j], c47_reg, get_reg(), coeff_avx_reg ) )
        c03_ymm_list = ['ymm9', 'ymm11', 'ymm13', 'ymm15'] #c00:c33
        c47_ymm_list = ['ymm8', 'ymm10', 'ymm12', 'ymm14'] #c40:c73
        # An earlier, commented-out variant of each template below hard-coded
        # xmm0/xmm1/xmm2 as scratch registers and ended with its own
        # "addq %%rdi" line; the current templates take three scratch
        # registers from get_avx_reg(avoid_reg=coeff_avx_reg).
        # Placeholders: {0} accumulator ymm name, {1} column index (emitted
        # comments only), {2} address register, {3} accumulator register
        # number, {4} = j, {5} coefficient register number, {6}-{8} scratch
        # register numbers.  `[3:]` strips the leading "ymm" so the number
        # can follow "xmm".
        for idx in range(4):
            myfile.write( \
'''\
    "vextractf128 $1, %%{0}, %%xmm{7} \\n\\t"
    "vmovlpd (%%{2}), %%xmm{6}, %%xmm{6} \\n\\t" // load c{4}_0{1} and c{4}_1{1},
    "vmovhpd (%%{2},%%rsi), %%xmm{6}, %%xmm{6} \\n\\t"
    "vmulpd %%xmm{5}, %%xmm{3}, %%xmm{8} \\n\\t" // scale by coeff{4},
    "vaddpd %%xmm{8}, %%xmm{6}, %%xmm{8} \\n\\t" // add the gemm result,
    "vmovlpd %%xmm{8}, (%%{2}) \\n\\t" // and store back to memory.
    "vmovhpd %%xmm{8}, (%%{2},%%rsi) \\n\\t"
    "vmovlpd (%%{2},%%r12), %%xmm{6}, %%xmm{6} \\n\\t" // load c{4}_2{1} and c{4}_3{1},
    "vmovhpd (%%{2},%%r13), %%xmm{6}, %%xmm{6} \\n\\t"
    "vmulpd %%xmm{5}, %%xmm{7}, %%xmm{8} \\n\\t" // scale by coeff{4},
    "vaddpd %%xmm{8}, %%xmm{6}, %%xmm{8} \\n\\t" // add the gemm result,
    "vmovlpd %%xmm{8}, (%%{2},%%r12) \\n\\t" // and store back to memory.
    "vmovhpd %%xmm{8}, (%%{2},%%r13) \\n\\t"
    " \\n\\t"
'''.format( c03_ymm_list[idx], str(idx), get_reg.c2reg[j], c03_ymm_list[idx][3:], j, coeff_avx_reg[3:], (get_avx_reg(avoid_reg=coeff_avx_reg))[3:], (get_avx_reg(avoid_reg=coeff_avx_reg))[3:], (get_avx_reg(avoid_reg=coeff_avx_reg))[3:], ) )
            # Step to the next column, except after the last one.
            if (idx != 3):
                write_line(
                    myfile, 1,
                    '"addq %%rdi, %%{0} \\n\\t" // c += cs_c;'
                    .format(get_reg.c2reg[j]))
        # Second pass: same template shape, using the ymm8/10/12/14
        # accumulators and the second base address in c47_reg.
        for idx in range(4):
            myfile.write( \
'''\
    "vextractf128 $1, %%{0}, %%xmm{7} \\n\\t"
    "vmovlpd (%%{2}), %%xmm{6}, %%xmm{6} \\n\\t" // load c{4}_4{1} and c{4}_5{1},
    "vmovhpd (%%{2},%%rsi), %%xmm{6}, %%xmm{6} \\n\\t"
    "vmulpd %%xmm{5}, %%xmm{3}, %%xmm{8} \\n\\t" // scale by coeff{4},
    "vaddpd %%xmm{8}, %%xmm{6}, %%xmm{8} \\n\\t" // add the gemm result,
    "vmovlpd %%xmm{8}, (%%{2}) \\n\\t" // and store back to memory.
    "vmovhpd %%xmm{8}, (%%{2},%%rsi) \\n\\t"
    "vmovlpd (%%{2},%%r12), %%xmm{6}, %%xmm{6} \\n\\t" // load c{4}_6{1} and c{4}_7{1},
    "vmovhpd (%%{2},%%r13), %%xmm{6}, %%xmm{6} \\n\\t"
    "vmulpd %%xmm{5}, %%xmm{7}, %%xmm{8} \\n\\t" // scale by coeff{4},
    "vaddpd %%xmm{8}, %%xmm{6}, %%xmm{8} \\n\\t" // add the gemm result,
    "vmovlpd %%xmm{8}, (%%{2},%%r12) \\n\\t" // and store back to memory.
    "vmovhpd %%xmm{8}, (%%{2},%%r13) \\n\\t"
    " \\n\\t"
'''.format( c47_ymm_list[idx], str(idx), c47_reg, c47_ymm_list[idx][3:], j, coeff_avx_reg[3:], (get_avx_reg(avoid_reg=coeff_avx_reg))[3:], (get_avx_reg(avoid_reg=coeff_avx_reg))[3:], (get_avx_reg(avoid_reg=coeff_avx_reg))[3:], ) )
            if (idx != 3):
                write_line(
                    myfile, 1,
                    '"addq %%rdi, %%{0} \\n\\t" // c += cs_c;'
                    .format(c47_reg))
    # NOTE(review): the flattened source does not show whether these three
    # lines sit inside or after the `for j` loop; they are placed after it so
    # the jump is emitted once -- confirm against the original file.
    write_line(myfile, 1, '" \\n\\t"')
    write_line(
        myfile, 1,
        '"jmp .DDONE \\n\\t" // jump to end.')
    write_line(myfile, 1, '" \\n\\t"')
def write_mulstrassen_kernel_caller(myfile, nonzero_coeffs):
    """Emit the call to the generic ``bl_dgemm_asm_8x4_mulstrassen`` kernel.

    Six lines are written through ``write_line(myfile, 3, ...)``: the
    declarations of ``alpha_list``, ``c_list`` and ``len_c`` (all sized by
    the number of coefficients), one line assigning every coefficient into
    ``alpha_list``, one line pointing every ``c_list`` entry at
    ``&C<j>[ j * ldC + i ]``, and the kernel call itself.
    """
    count = len(nonzero_coeffs)
    for declaration in ('double alpha_list[%d];',
                        'double *c_list[%d];',
                        'unsigned long long len_c=%d;'):
        write_line(myfile, 3, declaration % count)

    alpha_stmts = [
        'alpha_list[%d]= (double)(%s)' % (slot, value)
        for slot, value in enumerate(nonzero_coeffs)
    ]
    write_line(myfile, 3, '; '.join(alpha_stmts) + ';')

    pointer_stmts = [
        'c_list[%d] = &C%d[ j * ldC + i ]' % (slot, slot)
        for slot in range(count)
    ]
    write_line(myfile, 3, '; '.join(pointer_stmts) + ';')

    write_line(
        myfile, 3,
        '( bl_dgemm_asm_8x4_mulstrassen ) ( k, &packA[ i * k ], &packB[ j * k ], (unsigned long long) len_c, (unsigned long long) ldC, c_list, alpha_list, &aux );'
    )
def write_triangle(kml, name, colors, points):
    """Write the outline of a triangle as three coloured lines.

    For ``i`` in 0..2 a line from ``points[i]`` to the next vertex (wrapping
    back to ``points[0]``) is written with ``common.write_line`` in
    ``colors[i]``.  ``name`` is accepted but not used.
    """
    corner = 0
    while corner < 3:
        following = (corner + 1) % 3
        common.write_line(kml, points[corner], points[following],
                          colors[corner])
        corner += 1
def write_macro_func(myfile, coeffs, index, mat_name):
    '''
    Write the macro kernel ``bl_macro_kernel_stra_abc<index>``.

    The generated C function takes one ``double *<mat_name><i>`` argument per
    nonzero coefficient and loops over the packed panels in steps of
    DGEMM_NR / DGEMM_MR.  The loop body depends on the nonzero coefficients:

    * at most 23 of them and ``contain_nontrivial`` false: a direct call to
      ``bl_dgemm_micro_kernel_stra_abc<index>``;
    * at most 23 of them and ``contain_nontrivial`` true: the same call with
      an extra ``alpha_list`` argument, preceded by the declaration and
      initialisation of that array;
    * more than 23: whatever ``write_mulstrassen_kernel_caller`` emits.

    coeffs is the set of coefficients of the C blocks for this multiply.
    '''
    kept = [value for value in coeffs if is_nonzero(value)]
    count = len(kept)

    # TODO(arbenson): put in a code-generated comment here
    block_params = ', '.join(
        ['double *%s%d' % (mat_name, slot) for slot in range(count)])
    signature = (
        'inline void bl_macro_kernel_stra_abc%d( int m, int n, int k, double *packA, double *packB, '
        % (index) + block_params + ', int ld%s ) {' % (mat_name))
    write_line(myfile, 0, signature)

    # Fixed loop scaffolding; every line goes out through write_line level 1.
    scaffolding = (
        'int i, j;',
        'aux_t aux;',
        'aux.b_next = packB;',
        'for ( j = 0; j < n; j += DGEMM_NR ) {',
        ' aux.n = min( n - j, DGEMM_NR );',
        ' for ( i = 0; i < m; i += DGEMM_MR ) {',
        ' aux.m = min( m - i, DGEMM_MR );',
        ' if ( i + DGEMM_MR >= m ) {',
        ' aux.b_next += DGEMM_NR * k;',
        ' }',
    )
    for text in scaffolding:
        write_line(myfile, 1, text)

    if count > 23:
        write_mulstrassen_kernel_caller(myfile, kept)
    else:
        needs_alpha = contain_nontrivial(kept)
        if needs_alpha:
            write_line(myfile, 3, 'double alpha_list[%d];' % count)
            assignments = '; '.join([
                'alpha_list[%d]= (double)(%s)' % (slot, value)
                for slot, value in enumerate(kept)
            ])
            write_line(myfile, 3, assignments + ';')
        call = (
            '( bl_dgemm_micro_kernel_stra_abc%d ) ( k, &packA[ i * k ], &packB[ j * k ], '
            % index)
        call += '(unsigned long long) ld%s, ' % mat_name
        call += ', '.join([
            '&%s%d[ j * ld%s + i ]' % (mat_name, slot, mat_name)
            for slot in range(count)
        ])
        if needs_alpha:
            call += ', alpha_list , &aux );'
        else:
            call += ', &aux );'
        write_line(myfile, 3, call)

    # Close the i loop, the j loop and the function.
    for depth in (2, 1, 0):
        write_line(myfile, depth, '}')
def gen_abc_fmm(coeff_filename, dims, level, outfilename,
                micro_kernel_filename, kernel_header_filename):
    """Generate the three source files of an ABC fast-matrix-multiply variant.

    Reads the coefficients from ``coeff_filename``, expands them with
    ``generateCoeffs(coeffs, level)`` and writes:

    * ``outfilename``: includes, packing routines, macro kernels, the
      ``bl_dgemm_straprim_abc*`` functions and the
      ``bl_dgemm_strassen_abc`` driver;
    * ``micro_kernel_filename``: the micro kernels;
    * ``kernel_header_filename``: the kernel header.

    All three files are now opened with context managers, so the micro-kernel
    and header files are closed and flushed as soon as they are written; the
    previous version never closed them.
    """
    coeffs = read_coeffs(coeff_filename)
    # NOTE(review): the [10:] slice presumably strips a fixed 10-character
    # directory prefix so the #include is relative -- confirm with the caller.
    header_include = kernel_header_filename[10:]

    with open(outfilename, 'w') as myfile:
        write_line(myfile, 0, '#include "%s"' % header_include)
        write_line(myfile, 0, '#include "bl_dgemm.h"')
        write_break(myfile)

        cur_coeffs = generateCoeffs(coeffs, level)
        num_multiplies = len(cur_coeffs[0][0])

        create_packm_functions(myfile, cur_coeffs)

        with open(micro_kernel_filename, 'w') as my_micro_file:
            create_micro_functions(my_micro_file, cur_coeffs, header_include)

        with open(kernel_header_filename, 'w') as my_kernel_header:
            create_kernel_header(my_kernel_header, cur_coeffs)

        create_macro_functions(myfile, cur_coeffs)
        create_straprim_abc_functions(myfile, cur_coeffs, dims, level)

        write_line(
            myfile, 0,
            'void bl_dgemm_strassen_abc( int m, int n, int k, double *XA, int lda, double *XB, int ldb, double *XC, int ldc )'
        )
        write_line(myfile, 0, '{')
        write_abc_strassen_header(myfile)
        writePartition(myfile, dims, level)
        write_break(myfile)

        write_line(myfile, 0, '#ifdef _PARALLEL_')
        write_line(myfile, 1, '#pragma omp parallel num_threads( bl_ic_nt )')
        write_line(myfile, 0, '#endif')
        write_line(myfile, 1, '{')
        create_straprim_caller(myfile, cur_coeffs, dims, num_multiplies,
                               level)
        write_line(myfile, 1, '}')
        write_break(myfile)

        level_dim = exp_dim(dims, level)
        write_line(
            myfile, 1,
            'bl_dynamic_peeling( m, n, k, XA, lda, XB, ldb, XC, ldc, %d * DGEMM_MR, %d, %d * DGEMM_NR );'
            % (level_dim[0], level_dim[1], level_dim[2]))
        write_break(myfile)

        write_line(myfile, 1, '//free( packA );')
        write_line(myfile, 1, '//free( packB );')
        write_line(myfile, 0, '}')
def write_abc_strassen_header(myfile):
    """Emit the opening statements of the generated driver function.

    Three groups of lines are written through ``write_line(myfile, 1, ...)``,
    each followed by a ``write_break``: the ``packA``/``packB`` pointer
    declaration, the ``bl_ic_nt`` thread-count lookup, and the packing-buffer
    allocation (two commented-out ``bl_malloc_aligned`` calls plus the live
    ``bl_malloc_packing_pool`` call).
    """
    paragraphs = (
        ('double *packA, *packB;', ),
        ('int bl_ic_nt = bl_read_nway_from_env( "BLISLAB_IC_NT" );', ),
        (
            '//// Allocate packing buffers',
            '//packA = bl_malloc_aligned( DGEMM_KC, ( DGEMM_MC + 1 ) * bl_ic_nt, sizeof(double) );',
            '//packB = bl_malloc_aligned( DGEMM_KC, ( DGEMM_NC + 1 ) , sizeof(double) );',
            'bl_malloc_packing_pool( &packA, &packB, n, bl_ic_nt );',
        ),
    )
    for paragraph in paragraphs:
        for text in paragraph:
            write_line(myfile, 1, text)
        write_break(myfile)
def write_straprim_abc_function(myfile, index, a_coeffs, b_coeffs, c_coeffs,
                                dims, level):
    """Emit the C function ``bl_dgemm_straprim_abc<index>`` into ``myfile``.

    The output consists of:

    * a one-line ``// M<index> = (...) * (...); ...`` comment built from the
      nonzero coefficients and the block names from ``getBlockName``;
    * the function signature, with one ``double*`` argument per nonzero
      coefficient of each operand (counted by ``getNNZ``);
    * a fixed ``jc`` / ``pc`` / ``ic`` loop nest that calls
      ``packB_add_stra_abc<index>``, ``packA_add_stra_abc<index>`` and
      ``bl_macro_kernel_stra_abc<index>``, with ``#pragma omp barrier``
      lines guarded by ``_PARALLEL_`` after each phase.

    No ``omp parallel`` pragma is emitted here.

    Args:
        myfile: Open, writable file object receiving the generated C code.
        index: Integer id of the multiply; appears in all emitted names.
        a_coeffs: Coefficients applied to the blocks of the first operand.
        b_coeffs: Coefficients applied to the blocks of the second operand.
        c_coeffs: Coefficients applied when updating the output blocks.
        dims: Forwarded to ``getBlockName``.
        level: Forwarded to ``getBlockName``.
    """

    def emit(depth, text):
        # Single line at the given write_line level.
        write_line(myfile, depth, text)

    def emit_all(rows):
        # Sequence of (level, text) pairs, written in order.
        for depth, text in rows:
            emit(depth, text)

    def weighted_blocks(which, coeffs):
        # "c0 * name0 + c1 * name1 + ..." over the nonzero coefficients.
        return ' + '.join([
            str(coef) + (' * %s' % getBlockName(which, pos, dims, level))
            for pos, coef in enumerate(coeffs) if is_nonzero(coef)
        ])

    def operands(template, letter, coeffs):
        # Comma-separated list with one entry per nonzero coefficient.
        return ', '.join(
            [template % (letter, pos) for pos in range(getNNZ(coeffs))])

    barrier = (
        (0, '#ifdef _PARALLEL_'),
        (0, '#pragma omp barrier'),
        (0, '#endif'),
    )

    # Descriptive comment preceding the function.
    emit(0, ''.join([
        '// M%d = (' % (index),
        weighted_blocks(0, a_coeffs),
        ') * (',
        weighted_blocks(1, b_coeffs),
        '); ',
        '; '.join([
            ' %s += %s * M%d' % (getBlockName(2, pos, dims, level), coef, index)
            for pos, coef in enumerate(c_coeffs) if is_nonzero(coef)
        ]),
        ';',
    ]))

    # Function signature.
    emit(0, ''.join([
        'void bl_dgemm_straprim_abc%d( int m, int n, int k, ' % index,
        operands('double* %s%d', 'a', a_coeffs),
        ', int lda, ',
        operands('double* %s%d', 'b', b_coeffs),
        ', int ldb, ',
        operands('double* %s%d', 'c', c_coeffs),
        ', int ldc, double *packA, double *packB, int bl_ic_nt ) {',
    ]))

    # Outer loops and the B-packing phase.
    emit_all([
        (1, 'int i, j, p, ic, ib, jc, jb, pc, pb;'),
        (1, 'for ( jc = 0; jc < n; jc += DGEMM_NC ) {'),
        (2, 'jb = min( n - jc, DGEMM_NC );'),
        (2, 'for ( pc = 0; pc < k; pc += DGEMM_KC ) {'),
        (3, 'pb = min( k - pc, DGEMM_KC );'),
        (3, '{'),
        (4, 'int tid = omp_get_thread_num();'),
        (4, 'int my_start;'),
        (4, 'int my_end;'),
        (4, 'bl_get_range( jb, DGEMM_NR, &my_start, &my_end );'),
        (4, 'for ( j = my_start; j < my_end; j += DGEMM_NR ) {'),
    ])
    emit(5, ''.join([
        'packB_add_stra_abc%d( min( jb - j, DGEMM_NR ), pb, ' % index,
        operands('&%s%d[ pc + (jc+j)*ldb ]', 'b', b_coeffs),
        ', ldb, &packB[ j * pb ] );',
    ]))
    emit_all([(4, '}'), (3, '}')])
    emit_all(barrier)

    # A-packing phase followed by the macro kernel call.
    emit_all([
        (3, '{'),
        (4, 'int tid = omp_get_thread_num();'),
        (4, 'int my_start;'),
        (4, 'int my_end;'),
        (4, 'bl_get_range( m, DGEMM_MR, &my_start, &my_end );'),
        (4, 'for ( ic = my_start; ic < my_end; ic += DGEMM_MC ) {'),
        (5, 'ib = min( my_end - ic, DGEMM_MC );'),
        (5, 'for ( i = 0; i < ib; i += DGEMM_MR ) {'),
    ])
    emit(6, ''.join([
        'packA_add_stra_abc%d( min( ib - i, DGEMM_MR ), pb, ' % index,
        operands('&%s%d[ pc*lda + (ic+i) ]', 'a', a_coeffs),
        ', lda, &packA[ tid * DGEMM_MC * pb + i * pb ] );',
    ]))
    emit(5, '}')
    emit(5, ''.join([
        'bl_macro_kernel_stra_abc%d( ib, jb, pb, packA + tid * DGEMM_MC * pb, packB, ' % index,
        operands('&%s%d[ jc * ldc + ic ]', 'c', c_coeffs),
        ', ldc );',
    ]))
    emit_all([(4, '}'), (3, '}')])
    emit_all(barrier)

    # Close the pc and jc loops, then the function itself.
    emit_all([(2, '}'), (1, '}')])
    emit_all(barrier)
    emit(0, '}')
    write_break(myfile)
def write_packm_func(myfile, coeffs, index, mat_name):
    '''
    Write the add function for a set of coefficients. This is a custom add
    function used for a single multiply in a single fast algorithm.

    The emitted C function is named ``pack<mat_name>_add_stra_abc<index>``
    and takes one source pointer per nonzero coefficient.

    myfile is the open file object receiving the generated C code
    coeffs is the set of coefficients used for the add
    index is the integer id of the multiply, used in the emitted name
    mat_name selects the operand and must be 'A' or 'B'

    Raises ValueError, before anything is written, if mat_name is neither
    'A' nor 'B'.
    '''
    # Validate first so an unsupported name never leaves a half-written
    # function in the output file.
    if mat_name == 'A':
        ldp = 'DGEMM_MR'          # stride between packed columns
        ldm = 'ld%s' % mat_name   # stride used when positioning source pointers
        incm = '1'                # increment handed to arith_expression_pntr
    elif mat_name == 'B':
        ldp = 'DGEMM_NR'
        ldm = '1'
        incm = 'ld%s' % mat_name
    else:
        raise ValueError(
            "Wrong mat_name: %r (expected 'A' or 'B')" % (mat_name,))

    nonzero_coeffs = [coeff for coeff in coeffs if is_nonzero(coeff)]
    nnz = len(nonzero_coeffs)

    # TODO(arbenson): put in a code-generated comment here
    add = 'inline void pack%s_add_stra_abc%d( int m, int n, ' % (mat_name, index)
    add += ', '.join(['double *%s%d' % (mat_name, i) for i in range(nnz)])
    add += ', int ld%s, double *pack%s ' % (mat_name, mat_name)
    add += ') {'
    write_line(myfile, 0, add)

    write_line(myfile, 1, 'int i, j;')
    add = 'double '
    add += ', '.join(['*%s%d_pntr' % (mat_name, i) for i in range(nnz)])
    add += ', *pack%s_pntr;' % mat_name
    write_line(myfile, 1, add)

    write_line(myfile, 1, 'for ( j = 0; j < n; ++j ) {')
    write_line(myfile, 2, 'pack%s_pntr = &pack%s[ %s * j ];' %
               (mat_name, mat_name, ldp))

    # With a unit stride the source pointers are indexed by j directly;
    # otherwise j is scaled by the leading dimension.
    if ldm == '1':
        add = ''.join([
            '%s%d_pntr = &%s%d[ j ]; ' % (mat_name, i, mat_name, i)
            for i in range(nnz)
        ])
    else:
        add = ''.join([
            '%s%d_pntr = &%s%d[ %s * j ]; ' % (mat_name, i, mat_name, i, ldm)
            for i in range(nnz)
        ])
    write_line(myfile, 2, add)

    write_line(myfile, 2, 'for ( i = 0; i < %s; ++i ) {' % ldp)
    # One term per nonzero coefficient; the term text comes from
    # arith_expression_pntr.
    terms = ''.join([
        arith_expression_pntr(coeff, mat_name, pos, incm)
        for pos, coeff in enumerate(nonzero_coeffs)
    ])
    write_line(myfile, 3, 'pack%s_pntr[ i ]' % mat_name + ' =' + terms + ';')
    write_line(myfile, 2, '}')

    write_line(myfile, 1, '}')
    write_line(myfile, 0, '}')  # end of function