def write_multiply_caller(myfile, index, a_coeffs, b_coeffs, c_coeffs,
                          dims_mix, num_multiplies):
    """Emit the call to ``bl_dgemm_straprim_naive<index>``.

    Two lines are sent to ``myfile`` through ``write_line`` at indent level 1:

    * a ``//`` comment describing ``M<index>`` as the product of two linear
      combinations of blocks, followed by the ``+=`` updates it feeds;
    * the call statement, which passes the block name of every non-zero
      coefficient of the three coefficient lists (operands 0, 1 and 2 of
      ``getBlockName``).

    ``num_multiplies`` is part of the signature but is not read here.
    """
    def nonzero_terms(operand, coeffs):
        # Pair each non-zero coefficient with the name of its block.
        return [(coeff, getBlockName(operand, pos, dims_mix))
                for pos, coeff in enumerate(coeffs) if is_nonzero(coeff)]

    def block_list(terms):
        return ', '.join(block for _, block in terms)

    def linear_combination(terms):
        return ' + '.join('%s * %s' % (coeff, block)
                          for coeff, block in terms)

    a_terms = nonzero_terms(0, a_coeffs)
    b_terms = nonzero_terms(1, b_coeffs)
    c_terms = nonzero_terms(2, c_coeffs)

    updates = '; '.join(' %s += %s * M%d' % (block, coeff, index)
                        for coeff, block in c_terms)
    description = '// M%d = (%s) * (%s); %s;' % (
        index,
        linear_combination(a_terms),
        linear_combination(b_terms),
        updates,
    )
    write_line(myfile, 1, description)

    call = ('bl_dgemm_straprim_naive%d( ms, ns, ks, %s, lda, %s, ldb, %s, '
            'ldc, bl_ic_nt );') % (
        index,
        block_list(a_terms),
        block_list(b_terms),
        block_list(c_terms),
    )
    write_line(myfile, 1, call)
def write_M_add_func(myfile, coeffs, index, mat_name):
    """Emit the C function ``<mat_name>_Add<index>``.

    The generated function takes one ``double*`` per non-zero entry of
    ``coeffs`` (named ``<mat_name>0``, ``<mat_name>1``, ...) plus ``R`` and,
    inside a doubly nested ``j``/``i`` loop, accumulates into each of those
    operands the expression produced by ``arith_expression(coeff, 'R', '')``.

    All text is written to ``myfile`` via ``write_line``.
    """
    kept = [value for value in coeffs if is_nonzero(value)]

    pointer_params = ', '.join(
        'double* %s%d' % (mat_name, pos) for pos in range(len(kept)))
    header = ('void %s_Add%d( int m, int n, %s, int ld%s, double* R, '
              'int ldR, int bl_ic_nt ) {') % (
        mat_name, index, pointer_params, mat_name)
    write_line(myfile, 0, header)

    # Loop prologue; the OpenMP pragma is only active under _PARALLEL_.
    for level, text in (
            (1, 'int i, j;'),
            (0, '#ifdef _PARALLEL_'),
            (1, '#pragma omp parallel for num_threads( bl_ic_nt )'),
            (0, '#endif'),
            (1, 'for ( j = 0; j < n; ++j ) {'),
            (2, 'for ( i = 0; i < m; ++i ) {')):
        write_line(myfile, level, text)

    # One accumulation statement per non-zero coefficient.
    for pos, value in enumerate(kept):
        statement = '%s += %s;' % (
            data_access(mat_name, str(pos)),
            arith_expression(value, 'R', ''),
        )
        write_line(myfile, 3, statement)

    for level in (2, 1, 0):
        write_line(myfile, level, '}')
def addition_str(coeffs, coeff_index, mat_name, tmp_name, dims_mix):
    """Return the ``<tmp_name>_Add<index>( ... );`` call statement as text.

    The argument list is: the prefix returned by ``para_ld(coeff_index)``,
    the block name of every non-zero entry of ``coeffs``, ``ld<mat_name>``,
    the temporary ``<tmp_name><index>``, ``ld<tmp_name>`` and ``bl_ic_nt``.

    NOTE(review): ``index`` and ``para_ld`` are neither parameters nor locals
    of this function; they are looked up in the surrounding scope at call
    time -- confirm they are defined wherever this is called from.
    """
    target = '%s%d' % (tmp_name, index)
    pieces = ['%s_Add%d( %s' % (tmp_name, index, para_ld(coeff_index))]
    pieces.extend(
        getBlockName(coeff_index, pos, dims_mix) + ', '
        for pos, value in enumerate(coeffs) if is_nonzero(value))
    pieces.append('ld%s, ' % (mat_name))
    pieces.append(target + ', ld%s, bl_ic_nt );' % tmp_name)
    return ''.join(pieces)
def write_packm_func(myfile, coeffs, index, mat_name):
    """Emit the C function ``pack<mat_name>_add_stra_abc<index>``.

    The generated function receives one ``double *`` per non-zero entry of
    ``coeffs`` and fills ``pack<mat_name>`` with the expression assembled
    from ``arith_expression_pntr`` over those operands.

    Args:
        myfile: output handle passed through to ``write_line``.
        coeffs: coefficients of the linear combination; zero entries (as
            judged by ``is_nonzero``) are skipped.
        index: numeric suffix of the generated function name.
        mat_name: ``'A'`` or ``'B'``; selects the packing layout
            (``DGEMM_MR`` / ``DGEMM_NR`` and which stride is ``ld<mat_name>``).

    Raises:
        ValueError: if ``mat_name`` is neither ``'A'`` nor ``'B'``.  This is
            checked before anything is written, so an unsupported name no
            longer leaves a half-emitted function behind.
    """
    # mat_name -> (packed panel width, source stride per j, source stride per i)
    layouts = {
        'A': ('DGEMM_MR', 'ld%s' % mat_name, '1'),
        'B': ('DGEMM_NR', '1', 'ld%s' % mat_name),
    }
    if mat_name not in layouts:
        raise ValueError(
            "Wrong mat_name! expected 'A' or 'B', got %r" % (mat_name,))
    ldp, ldm, incm = layouts[mat_name]

    nonzero_coeffs = [coeff for coeff in coeffs if is_nonzero(coeff)]
    operand_ids = range(len(nonzero_coeffs))

    header = 'inline void pack%s_add_stra_abc%d( int m, int n, ' % (
        mat_name, index)
    header += ', '.join('double *%s%d' % (mat_name, i) for i in operand_ids)
    header += ', int ld%s, double *pack%s ' % (mat_name, mat_name)
    header += ') {'
    write_line(myfile, 0, header)

    write_line(myfile, 1, 'int i, j;')
    declarations = 'double '
    declarations += ', '.join(
        '*%s%d_pntr' % (mat_name, i) for i in operand_ids)
    declarations += ', *pack%s_pntr;' % mat_name
    write_line(myfile, 1, declarations)

    write_line(myfile, 1, 'for ( j = 0; j < n; ++j ) {')
    write_line(myfile, 2, 'pack%s_pntr = &pack%s[ %s * j ];'
               % (mat_name, mat_name, ldp))

    # A unit stride is written as a bare "j" rather than "1 * j".
    offset = 'j' if ldm == '1' else '%s * j' % ldm
    pointer_setup = ''.join(
        '%s%d_pntr = &%s%d[ %s ]; ' % (mat_name, i, mat_name, i, offset)
        for i in operand_ids)
    write_line(myfile, 2, pointer_setup)

    write_line(myfile, 2, 'for ( i = 0; i < %s; ++i ) {' % ldp)
    assignment = 'pack%s_pntr[ i ]' % mat_name + ' ='
    for ind, coeff in enumerate(nonzero_coeffs):
        assignment += arith_expression_pntr(coeff, mat_name, ind, incm)
    assignment += ';'
    write_line(myfile, 3, assignment)

    write_line(myfile, 2, '}')
    write_line(myfile, 1, '}')
    write_line(myfile, 0, '}')  # end of function
def create_kernel_header(myfile, coeffs):
    """Write the micro-kernel header section to ``myfile``.

    Between ``write_header_start`` and ``write_header_end`` of
    ``abc_micro_kernel_gen``, one kernel header is generated for every
    non-empty entry of ``transpose(coeffs[2])``, using only that entry's
    non-zero coefficients and its position as the kernel index.  Each
    generated header is followed by ``write_break``.
    """
    write_break(myfile)
    abc_micro_kernel_gen.write_header_start(myfile)

    for kernel_id, column in enumerate(transpose(coeffs[2])):
        if len(column) == 0:
            continue
        kept = [value for value in column if is_nonzero(value)]
        abc_micro_kernel_gen.generate_kernel_header(myfile, kept, kernel_id)
        write_break(myfile)

    abc_micro_kernel_gen.write_header_end(myfile)
def write_output_add(myfile, index, coeffs, dims, rank):
    """Emit one ``M_Add<index>( ... );`` call at indent level 1.

    The call lists ``M<i>`` for every position ``i`` whose coefficient is
    non-zero, then ``ldM``, then the output block obtained from
    ``getBlockName(2, index, dims)``.

    ``rank`` is accepted for interface compatibility and is not read.
    """
    products = ''.join(
        'M%s, ' % pos
        for pos, value in enumerate(coeffs) if is_nonzero(value))
    destination = getBlockName(2, index, dims)
    statement = 'M_Add%d( ms, ns, %sldM, %s, ldc, bl_ic_nt );' % (
        index, products, destination)
    write_line(myfile, 1, statement)
def create_micro_functions(myfile, coeffs, kernel_header_filename):
    """Write the micro-kernel source section to ``myfile``.

    Emits the ``#include`` of ``kernel_header_filename``, the common macros
    from ``abc_micro_kernel_gen``, and then, for each non-empty entry of
    ``transpose(coeffs[2])``, a micro-kernel built from that entry's non-zero
    coefficients.  A kernel is only generated when there are at most 23
    non-zero coefficients; a break is written after every non-empty entry.
    """
    max_kernel_operands = 23  # limit applied to the number of non-zero coefficients

    write_line(myfile, 0, '#include "%s"' % kernel_header_filename)
    write_break(myfile)
    abc_micro_kernel_gen.write_common_rankk_macro_assembly(myfile)
    write_break(myfile)
    abc_micro_kernel_gen.macro_initialize_assembly(myfile)
    write_break(myfile)

    for kernel_id, column in enumerate(transpose(coeffs[2])):
        if len(column) == 0:
            continue
        kept = [value for value in column if is_nonzero(value)]
        if len(kept) <= max_kernel_operands:
            abc_micro_kernel_gen.generate_micro_kernel(myfile, kept, kernel_id)
        write_break(myfile)
def write_add_func(myfile, coeffs, index, mat_name):
    """Emit the C function ``<mat_name>_Add<index>`` for one coefficient set.

    The generated function takes one ``double*`` per non-zero entry of
    ``coeffs`` plus ``R`` and loops over ``j`` and ``i``.  The loop body
    depends on ``mat_name``:

    * ``'M'``: each operand is updated in place, one ``+=`` statement per
      non-zero coefficient, with the right-hand side built from ``R``;
    * anything else: a single statement assigns to ``R`` the combination of
      all operands.
    """
    kept = [value for value in coeffs if is_nonzero(value)]
    updates_operands = (mat_name == 'M')

    def emit(level, text):
        write_line(myfile, level, text)

    pointer_params = ', '.join(
        'double* %s%d' % (mat_name, pos) for pos in range(len(kept)))
    emit(0, ('void %s_Add%d( int m, int n, %s, int ld%s, double* R, '
             'int ldR, int bl_ic_nt ) {') % (
        mat_name, index, pointer_params, mat_name))

    # Both variants share the same loop prologue.
    emit(1, 'int i, j;')
    emit(0, '#ifdef _PARALLEL_')
    emit(1, '#pragma omp parallel for num_threads( bl_ic_nt )')
    emit(0, '#endif')
    emit(1, 'for ( j = 0; j < n; ++j ) {')
    emit(2, 'for ( i = 0; i < m; ++i ) {')

    if updates_operands:
        # Scatter R into every operand.
        for pos, value in enumerate(kept):
            target = data_access(mat_name, str(pos))
            emit(3, '%s += %s;' % (target, arith_expression(value, 'R', '')))
    else:
        # Gather all operands into R.
        statement = data_access('R') + ' ='
        statement += ''.join(
            arith_expression(value, mat_name, pos)
            for pos, value in enumerate(kept))
        emit(3, statement + ';')

    emit(2, '}')
    emit(1, '}')
    emit(0, '}')  # end of function
def subblock_name(coeffs, coeff_index, mat_name, tmp_name, dims_mix):
    """Return the name of the operand to use for ``coeffs``.

    When ``need_tmp_mat(coeffs)`` is false, this is the block name of the
    first non-zero coefficient; otherwise it is the temporary
    ``<tmp_name><index>``.  ``mat_name`` is not read.

    NOTE(review): ``index`` is not a parameter or local of this function; it
    is looked up in the surrounding scope at call time -- confirm it is
    defined wherever this is called from.
    """
    if not need_tmp_mat(coeffs):
        positions = [pos for pos, value in enumerate(coeffs)
                     if is_nonzero(value)]
        return getBlockName(coeff_index, positions[0], dims_mix)
    return '%s%d' % (tmp_name, index)
def write_straprim_naive_function(myfile, index, a_coeffs, b_coeffs, c_coeffs,
                                  dims_mix, num_multiplies):
    """Emit the C function ``bl_dgemm_straprim_naive<index>``.

    The generated function:

    1. forms temporaries ``S<index>`` / ``T<index>`` through ``S_Add`` /
       ``T_Add`` calls for each input whose coefficients satisfy
       ``need_tmp_mat``;
    2. allocates and zeroes ``M<index>`` and calls ``bl_dgemm`` on the
       temporaries (or, when no temporary is needed, on the block of the
       first non-zero coefficient);
    3. distributes ``M<index>`` with an ``M_Add<index>`` call and frees
       whatever it allocated.

    A ``//`` comment describing the product precedes the function.
    ``num_multiplies`` is part of the signature but is not read.
    """
    def emit(level, text):
        write_line(myfile, level, text)

    def nonzero_terms(operand, coeffs):
        # (coefficient, block name) for each non-zero coefficient.
        return [(coeff, getBlockName(operand, pos, dims_mix))
                for pos, coeff in enumerate(coeffs) if is_nonzero(coeff)]

    def linear_combination(terms):
        return ' + '.join('%s * %s' % (coeff, block)
                          for coeff, block in terms)

    def pointer_params(terms):
        return ', '.join('double* %s' % block for _, block in terms)

    a_terms = nonzero_terms(0, a_coeffs)
    b_terms = nonzero_terms(1, b_coeffs)
    c_terms = nonzero_terms(2, c_coeffs)

    updates = '; '.join(' %s += %s * M%d' % (block, coeff, index)
                        for coeff, block in c_terms)
    emit(0, '// M%d = (%s) * (%s); %s;' % (
        index, linear_combination(a_terms), linear_combination(b_terms),
        updates))

    emit(0, ('void bl_dgemm_straprim_naive%d( int ms, int ns, int ks, '
             '%s, int lda, %s, int ldb, %s, int ldc, int bl_ic_nt ) {') % (
        index, pointer_params(a_terms), pointer_params(b_terms),
        pointer_params(c_terms)))
    emit(1, 'int ldS = ms, nS = ks, ldT = ks, nT = ns, ldM = ms, nM = ns;')

    # Leading "rows, cols, " arguments of the *_Add call for each operand.
    shape_args = {0: 'ms, ks, ', 1: 'ks, ns, ', 2: 'ms, ns, '}

    def add_call(terms, operand, ld_suffix, tmp_prefix):
        pieces = ['%s_Add%d( %s' % (tmp_prefix, index, shape_args[operand])]
        pieces.extend(block + ', ' for _, block in terms)
        pieces.append('ld%s, ' % ld_suffix)
        pieces.append('%s%d, ld%s, bl_ic_nt );' % (
            tmp_prefix, index, tmp_prefix))
        return ''.join(pieces)

    # Write the adds to temporaries where the input needs one.
    inputs = []
    for terms, coeffs, operand, ld_suffix, tmp_prefix in (
            (a_terms, a_coeffs, 0, 'a', 'S'),
            (b_terms, b_coeffs, 1, 'b', 'T')):
        uses_tmp = need_tmp_mat(coeffs)
        if uses_tmp:
            instantiate_tmp(myfile, tmp_prefix, index)
            emit(1, add_call(terms, operand, ld_suffix, tmp_prefix))
        inputs.append((uses_tmp, terms, ld_suffix, tmp_prefix))

    product = 'M%d' % index
    emit(1, 'double* M%d = bl_malloc_aligned( ldM, nM, sizeof(double) );'
         % index)
    emit(1, 'memset( M%d, 0, sizeof(double) * ldM * nM );' % index)

    # Each input contributes (pointer, leading-dimension suffix) to bl_dgemm.
    gemm_args = []
    for uses_tmp, terms, ld_suffix, tmp_prefix in inputs:
        if uses_tmp:
            gemm_args.extend(['%s%d' % (tmp_prefix, index), tmp_prefix])
        else:
            gemm_args.extend([terms[0][1], ld_suffix])
    gemm_args.append(product)
    emit(1, 'bl_dgemm( ms, ns, ks, %s, ld%s, %s, ld%s, %s, ldM );'
         % tuple(gemm_args))

    emit(1, add_call(c_terms, 2, 'c', 'M'))

    # Release the temporaries that were created above, then the product.
    for uses_tmp, _, _, tmp_prefix in inputs:
        if uses_tmp:
            emit(1, 'free( %s%d );' % (tmp_prefix, index))
    emit(1, 'free( M%d );' % index)
    emit(0, '}')
    write_break(myfile)
def gen_model_coefficient(coeff_filename_mix, level_mix):
    """Compute the counter vectors derived from a mix of coefficient files.

    Each file in ``coeff_filename_mix`` is read with ``read_coeffs`` and
    repeated ``level_mix[i]`` times before being combined by
    ``generateCoeffs``.  For each of the three resulting matrices, the number
    of non-zero entries per entry of ``transpose(matrix)`` is tallied and
    folded into four lists.

    Returns:
        ``[comp_counter, abc_counter, ab_counter, naive_counter]`` where
        ``comp_counter`` has 4 entries and the other three have 6.
    """
    stacked = []
    for position, coeff_file in enumerate(coeff_filename_mix):
        coeffs = read_coeffs(coeff_file)
        stacked.extend(coeffs for _ in range(level_mix[position]))

    cur_coeffs = generateCoeffs(stacked)

    def column_nnz(matrix):
        # Non-zero count of every entry of the transposed matrix.
        return [sum(1 for value in column if is_nonzero(value))
                for column in transpose(matrix)]

    a_nnz = column_nnz(cur_coeffs[0])
    b_nnz = column_nnz(cur_coeffs[1])
    c_nnz = column_nnz(cur_coeffs[2])

    a_total, a_cols = sum(a_nnz), len(a_nnz)
    b_total, b_cols = sum(b_nnz), len(b_nnz)
    c_total, c_cols = sum(c_nnz), len(c_nnz)

    abc_counter = [a_total, b_total, c_total, 0, 0, 0]
    ab_counter = [a_total, b_total, c_cols, 0, 0, 3 * c_total]

    # Slots 3 and 4 accumulate (nnz + 1) per column.
    # NOTE(review): a legacy comment here read "if nnz == 1, += 0", but the
    # code adds nnz + 1 unconditionally; that behaviour is kept as-is.
    naive_counter = [
        a_cols,
        b_cols,
        c_cols,
        a_total + a_cols,
        b_total + b_cols,
        3 * c_total,
    ]

    # Slots 1 and 2 accumulate (nnz - 1) per column.
    comp_counter = [c_cols, a_total - a_cols, b_total - b_cols, c_total]

    return [comp_counter, abc_counter, ab_counter, naive_counter]
def write_straprim_ab_function(myfile, index, a_coeffs, b_coeffs, c_coeffs,
                               dims_mix):
    """Emit the C function ``bl_dgemm_straprim_ab<index>``.

    The generated function allocates a zeroed buffer ``M``, runs the
    ``jc``/``pc``/``ic`` loop nest that packs the ``b`` and ``a`` operands
    (``packB_add_stra_ab<index>`` / ``packA_add_stra_ab<index>``) and calls
    ``bl_macro_kernel_stra_ab`` into ``M``, then hands ``M`` to
    ``M_Add<index>`` together with the ``c`` operands and frees it.

    A ``//`` comment describing the product precedes the function.  Operand
    counts come from ``num_nonzero`` for the signature and packing calls.
    """
    def emit_rows(rows):
        for level, text in rows:
            write_line(myfile, level, text)

    def linear_combination(operand, coeffs):
        return ' + '.join(
            '%s * %s' % (coeff, getBlockName(operand, pos, dims_mix))
            for pos, coeff in enumerate(coeffs) if is_nonzero(coeff))

    def numbered(template, count):
        # Comma-separated copies of template, one per operand number.
        return ', '.join(template % pos for pos in range(count))

    lhs = linear_combination(0, a_coeffs)
    rhs = linear_combination(1, b_coeffs)
    updates = '; '.join(
        ' %s += %s * M%d' % (getBlockName(2, pos, dims_mix), coeff, index)
        for pos, coeff in enumerate(c_coeffs) if is_nonzero(coeff))
    write_line(myfile, 0, '// M%d = (%s) * (%s); %s;' % (
        index, lhs, rhs, updates))

    a_count = num_nonzero(a_coeffs)
    b_count = num_nonzero(b_coeffs)
    c_count = num_nonzero(c_coeffs)

    signature = ('void bl_dgemm_straprim_ab%d( int m, int n, int k, '
                 '%s, int lda, %s, int ldb, %s, int ldc, '
                 'double *packA, double *packB, int bl_ic_nt ) {') % (
        index,
        numbered('double* a%d', a_count),
        numbered('double* b%d', b_count),
        numbered('double* c%d', c_count),
    )
    pack_b = ('packB_add_stra_ab%d( min( jb - j, DGEMM_NR ), pb, '
              '%s, ldb, &packB[ j * pb ] );') % (
        index, numbered('&b%d[ pc + (jc+j)*ldb ]', b_count))
    pack_a = ('packA_add_stra_ab%d( min( ib - i, DGEMM_MR ), pb, '
              '%s, lda, &packA[ tid * DGEMM_MC * pb + i * pb ] );') % (
        index, numbered('&a%d[ pc*lda + (ic+i) ]', a_count))

    emit_rows([
        (0, signature),
        (1, 'int i, j, p, ic, ib, jc, jb, pc, pb;'),
        (1, 'int ldM = m, nM = n;'),
        (1, 'double *M = bl_malloc_aligned( ldM, nM, sizeof(double) );'),
        (1, 'memset( M, 0, sizeof(double) * ldM * nM );'),
        (1, 'for ( jc = 0; jc < n; jc += DGEMM_NC ) {'),
        (2, 'jb = min( n - jc, DGEMM_NC );'),
        (2, 'for ( pc = 0; pc < k; pc += DGEMM_KC ) {'),
        (3, 'pb = min( k - pc, DGEMM_KC );'),
        # Packing of the b operands.
        (0, '#ifdef _PARALLEL_'),
        (3, '#pragma omp parallel for num_threads( bl_ic_nt ) private( j )'),
        (0, '#endif'),
        (3, 'for ( j = 0; j < jb; j += DGEMM_NR ) {'),
        (4, pack_b),
        (3, '}'),
        # Packing of the a operands followed by the macro kernel.
        (0, '#ifdef _PARALLEL_'),
        (3, '#pragma omp parallel num_threads( bl_ic_nt ) private( ic, ib, i )'),
        (0, '#endif'),
        (3, '{'),
        (4, 'int tid = omp_get_thread_num();'),
        (4, 'int my_start;'),
        (4, 'int my_end;'),
        (4, 'bl_get_range( m, DGEMM_MR, &my_start, &my_end );'),
        (4, 'for ( ic = my_start; ic < my_end; ic += DGEMM_MC ) {'),
        (5, 'ib = min( my_end - ic, DGEMM_MC );'),
        (5, 'for ( i = 0; i < ib; i += DGEMM_MR ) {'),
        (6, pack_a),
        (5, '}'),
        (5, 'bl_macro_kernel_stra_ab( ib, jb, pb, packA + tid * DGEMM_MC * pb, packB, &M[ jc * ldM + ic ], ldM );'),
        (4, '}'),
        (3, '}'),
        (2, '}'),
        (1, '}'),
    ])

    # The output operands are counted with is_nonzero directly here.
    scatter = 'M_Add%d( m, n, ' % (index)
    if len(c_coeffs) > 0:
        kept = [value for value in c_coeffs if is_nonzero(value)]
        scatter += numbered('c%d', len(kept))
    scatter += ', ldc, M, ldM, bl_ic_nt );'

    emit_rows([
        (1, scatter),
        (1, 'free( M );'),
        (0, '}'),
    ])
    write_break(myfile)
def write_macro_func(myfile, coeffs, index, mat_name):
    """Emit the C function ``bl_macro_kernel_stra_abc<index>``.

    The generated function loops over ``j`` and ``i`` in steps of
    ``DGEMM_NR`` / ``DGEMM_MR`` and invokes the micro-kernel once per tile,
    passing one pointer per non-zero entry of ``coeffs``.

    How the micro-kernel is invoked depends on the non-zero coefficients:

    * at most 23 of them and ``contain_nontrivial`` false: a direct call to
      ``bl_dgemm_micro_kernel_stra_abc<index>``;
    * at most 23 of them and ``contain_nontrivial`` true: the same call with
      an extra ``alpha_list`` array holding the coefficients;
    * more than 23: delegated to ``write_mulstrassen_kernel_caller``.
    """
    kept = [value for value in coeffs if is_nonzero(value)]
    count = len(kept)

    def emit(level, text):
        write_line(myfile, level, text)

    signature = ('inline void bl_macro_kernel_stra_abc%d( int m, int n, int k, '
                 'double *packA, double *packB, ') % (index)
    signature += ', '.join(
        'double *%s%d' % (mat_name, pos) for pos in range(count))
    signature += ', int ld%s ) {' % (mat_name)
    emit(0, signature)

    # Loop skeleton; every line is written at indent level 1.
    for text in (
            'int i, j;',
            'aux_t aux;',
            'aux.b_next = packB;',
            'for ( j = 0; j < n; j += DGEMM_NR ) {',
            ' aux.n = min( n - j, DGEMM_NR );',
            ' for ( i = 0; i < m; i += DGEMM_MR ) {',
            ' aux.m = min( m - i, DGEMM_MR );',
            ' if ( i + DGEMM_MR >= m ) {',
            ' aux.b_next += DGEMM_NR * k;',
            ' }'):
        emit(1, text)

    if count > 23:
        write_mulstrassen_kernel_caller(myfile, kept)
    else:
        needs_alpha = contain_nontrivial(kept)
        if needs_alpha:
            # Materialise the coefficients for the kernel.
            emit(3, 'double alpha_list[%d];' % count)
            emit(3, '; '.join(
                'alpha_list[%d]= (double)(%s)' % (pos, value)
                for pos, value in enumerate(kept)) + ';')

        call = ('( bl_dgemm_micro_kernel_stra_abc%d ) '
                '( k, &packA[ i * k ], &packB[ j * k ], ') % index
        call += '(unsigned long long) ld%s, ' % mat_name
        call += ', '.join(
            '&%s%d[ j * ld%s + i ]' % (mat_name, pos, mat_name)
            for pos in range(count))
        call += ', alpha_list , &aux );' if needs_alpha else ', &aux );'
        emit(3, call)

    emit(2, '}')
    emit(1, '}')
    emit(0, '}')  # end of function
def write_straprim_abc_function(myfile, index, a_coeffs, b_coeffs, c_coeffs,
                                dims_mix):
    """Emit the C function ``bl_dgemm_straprim_abc<index>``.

    The generated function runs the ``jc``/``pc`` loop nest; inside it, one
    braced section packs the ``b`` operands with
    ``packB_add_stra_abc<index>`` and a second one packs the ``a`` operands
    with ``packA_add_stra_abc<index>`` and calls
    ``bl_macro_kernel_stra_abc<index>`` on the ``c`` operands.  ``#pragma omp
    barrier`` lines, guarded by ``_PARALLEL_``, are written after each
    section and after the loop nest.

    A ``//`` comment describing the product precedes the function.  Operand
    counts come from ``num_nonzero``.
    """
    def emit_rows(rows):
        for level, text in rows:
            write_line(myfile, level, text)

    def linear_combination(operand, coeffs):
        return ' + '.join(
            '%s * %s' % (coeff, getBlockName(operand, pos, dims_mix))
            for pos, coeff in enumerate(coeffs) if is_nonzero(coeff))

    def numbered(template, count):
        # Comma-separated copies of template, one per operand number.
        return ', '.join(template % pos for pos in range(count))

    lhs = linear_combination(0, a_coeffs)
    rhs = linear_combination(1, b_coeffs)
    updates = '; '.join(
        ' %s += %s * M%d' % (getBlockName(2, pos, dims_mix), coeff, index)
        for pos, coeff in enumerate(c_coeffs) if is_nonzero(coeff))
    write_line(myfile, 0, '// M%d = (%s) * (%s); %s;' % (
        index, lhs, rhs, updates))

    a_count = num_nonzero(a_coeffs)
    b_count = num_nonzero(b_coeffs)
    c_count = num_nonzero(c_coeffs)

    signature = ('void bl_dgemm_straprim_abc%d( int m, int n, int k, '
                 '%s, int lda, %s, int ldb, %s, int ldc, '
                 'double *packA, double *packB, int bl_ic_nt ) {') % (
        index,
        numbered('double* a%d', a_count),
        numbered('double* b%d', b_count),
        numbered('double* c%d', c_count),
    )
    pack_b = ('packB_add_stra_abc%d( min( jb - j, DGEMM_NR ), pb, '
              '%s, ldb, &packB[ j * pb ] );') % (
        index, numbered('&b%d[ pc + (jc+j)*ldb ]', b_count))
    pack_a = ('packA_add_stra_abc%d( min( ib - i, DGEMM_MR ), pb, '
              '%s, lda, &packA[ tid * DGEMM_MC * pb + i * pb ] );') % (
        index, numbered('&a%d[ pc*lda + (ic+i) ]', a_count))
    macro_kernel = ('bl_macro_kernel_stra_abc%d( ib, jb, pb, '
                    'packA + tid * DGEMM_MC * pb, packB, %s, ldc );') % (
        index, numbered('&c%d[ jc * ldc + ic ]', c_count))

    barrier = [
        (0, '#ifdef _PARALLEL_'),
        (0, '#pragma omp barrier'),
        (0, '#endif'),
    ]

    emit_rows([
        (0, signature),
        (1, 'int i, j, p, ic, ib, jc, jb, pc, pb;'),
        (1, 'for ( jc = 0; jc < n; jc += DGEMM_NC ) {'),
        (2, 'jb = min( n - jc, DGEMM_NC );'),
        (2, 'for ( pc = 0; pc < k; pc += DGEMM_KC ) {'),
        (3, 'pb = min( k - pc, DGEMM_KC );'),
        # Section 1: pack the b operands.
        (3, '{'),
        (4, 'int tid = omp_get_thread_num();'),
        (4, 'int my_start;'),
        (4, 'int my_end;'),
        (4, 'bl_get_range( jb, DGEMM_NR, &my_start, &my_end );'),
        (4, 'for ( j = my_start; j < my_end; j += DGEMM_NR ) {'),
        (5, pack_b),
        (4, '}'),
        (3, '}'),
    ])
    emit_rows(barrier)
    emit_rows([
        # Section 2: pack the a operands and run the macro kernel.
        (3, '{'),
        (4, 'int tid = omp_get_thread_num();'),
        (4, 'int my_start;'),
        (4, 'int my_end;'),
        (4, 'bl_get_range( m, DGEMM_MR, &my_start, &my_end );'),
        (4, 'for ( ic = my_start; ic < my_end; ic += DGEMM_MC ) {'),
        (5, 'ib = min( my_end - ic, DGEMM_MC );'),
        (5, 'for ( i = 0; i < ib; i += DGEMM_MR ) {'),
        (6, pack_a),
        (5, '}'),
        (5, macro_kernel),
        (4, '}'),
        (3, '}'),
    ])
    emit_rows(barrier)
    emit_rows([
        (2, '}'),
        (1, '}'),
    ])
    emit_rows(barrier)
    write_line(myfile, 0, '}')
    write_break(myfile)