Beispiel #1
0
def write_multiply_caller(myfile, index, a_coeffs, b_coeffs, c_coeffs,
                          dims_mix, num_multiplies):
    """Emit a summary comment and the call to ``bl_dgemm_straprim_naive<index>``.

    Two lines are written to ``myfile`` at indent level 1:

    * a ``// M<index> = (...) * (...); ...`` comment listing every non-zero
      coefficient together with the block name it applies to, and
    * the call statement, passing the block names of the non-zero entries of
      ``a_coeffs``, ``b_coeffs`` and ``c_coeffs`` followed by ``lda``,
      ``ldb`` and ``ldc`` respectively.

    ``num_multiplies`` is accepted for signature compatibility and is not
    used by this function.
    """

    def active_blocks(operand, coeffs):
        # (coefficient, block name) for each non-zero coefficient, in order.
        pairs = []
        for position, coeff in enumerate(coeffs):
            if is_nonzero(coeff):
                pairs.append((coeff, getBlockName(operand, position, dims_mix)))
        return pairs

    def weighted_sum(pairs):
        return ' + '.join(
            ['%s * %s' % (str(coeff), name) for coeff, name in pairs])

    def name_list(pairs):
        return ', '.join(['%s' % name for _, name in pairs])

    a_blocks = active_blocks(0, a_coeffs)
    b_blocks = active_blocks(1, b_coeffs)
    c_blocks = active_blocks(2, c_coeffs)

    updates = '; '.join(
        [' %s += %s * M%d' % (name, coeff, index) for coeff, name in c_blocks])
    summary = ''.join([
        '// M%d = (' % (index),
        weighted_sum(a_blocks),
        ') * (',
        weighted_sum(b_blocks),
        '); ',
        updates,
        ';',
    ])
    write_line(myfile, 1, summary)

    call = ''.join([
        'bl_dgemm_straprim_naive%d( ms, ns, ks, ' % index,
        name_list(a_blocks),
        ', lda, ',
        name_list(b_blocks),
        ', ldb, ',
        name_list(c_blocks),
        ', ldc, bl_ic_nt );',
    ])
    write_line(myfile, 1, call)
Beispiel #2
0
def gen_naive_fmm(coeff_filename_mix, dims_mix, level_mix, outfile):
    """Generate the C source for ``bl_dgemm_strassen_naive`` into ``outfile``.

    ``coeff_filename_mix`` and ``dims_mix`` are walked in parallel with
    ``level_mix``: the i-th coefficient set (loaded with ``read_coeffs``) and
    the i-th dims entry are each repeated ``level_mix[i]`` times before being
    handed to the generation helpers.
    """
    # Repeat each loaded coefficient set once per level of its algorithm.
    coeffs_mix = []
    for position, coeff_file in enumerate(coeff_filename_mix):
        loaded = read_coeffs(coeff_file)
        coeffs_mix.extend(loaded for _ in range(level_mix[position]))

    # Same repetition scheme for the dims entries.
    dims_level_mix = []
    for position, dims in enumerate(dims_mix):
        dims_level_mix.extend(dims for _ in range(level_mix[position]))

    entry_signature = (
        'void bl_dgemm_strassen_naive( int m, int n, int k, double *XA, int lda, double *XB, int ldb, double *XC, int ldc )'
    )
    peeling_template = (
        'bl_dynamic_peeling( m, n, k, XA, lda, XB, ldb, XC, ldc, %d * DGEMM_MR, %d, %d * DGEMM_NR );'
    )

    with open(outfile, 'w') as myfile:
        write_line(myfile, 0, '#include "bl_dgemm.h"')
        write_break(myfile)

        cur_coeffs = generateCoeffs(coeffs_mix)
        num_multiplies = len(cur_coeffs[0][0])

        # Helper functions come first, then the driver that calls them.
        create_add_functions(myfile, cur_coeffs)
        create_straprim_naive_functions(myfile, cur_coeffs, dims_level_mix,
                                        num_multiplies)

        write_line(myfile, 0, entry_signature)
        write_line(myfile, 0, '{')
        write_naive_strassen_header(myfile)
        writePartition(myfile, dims_level_mix)
        write_break(myfile)

        create_straprim_caller(myfile, cur_coeffs, dims_level_mix,
                               num_multiplies)
        write_break(myfile)

        level_dim = exp_dim_mix(dims_level_mix)
        peeling_call = peeling_template % (level_dim[0], level_dim[1],
                                           level_dim[2])
        write_line(myfile, 1, peeling_call)

        write_line(myfile, 0, '}')
Beispiel #3
0
def write_output_add(myfile, index, coeffs, dims, rank):
    """Emit the ``M_Add<index>( ... )`` call statement at indent level 1.

    The call passes ``ms, ns``, then ``M<i>`` for every position ``i`` whose
    coefficient is non-zero, then ``ldM``, the name of output block ``index``
    (from ``getBlockName(2, index, dims)``), ``ldc`` and ``bl_ic_nt``.

    ``rank`` is accepted for signature compatibility and is not used.
    """
    operands = ''.join([
        'M%s, ' % position
        for position, coeff in enumerate(coeffs)
        if is_nonzero(coeff)
    ])
    target_block = getBlockName(2, index, dims)
    statement = 'M_Add%d( ms, ns, %sldM, %s, ldc, bl_ic_nt );' % (
        index, operands, target_block)
    write_line(myfile, 1, statement)
Beispiel #4
0
def create_micro_functions( myfile, coeffs, kernel_header_filename ):
    """Write the micro-kernel source file contents to ``myfile``.

    Emits the ``#include`` of ``kernel_header_filename`` and the shared
    assembly macros from ``abc_micro_kernel_gen``, then walks the rows of
    ``transpose( coeffs[2] )`` and generates one micro kernel per non-empty
    row that has at most 23 non-zero coefficients.
    """
    # Rows with more non-zero entries than this get no generated kernel.
    max_kernel_operands = 23

    write_line( myfile, 0, '#include "%s"' % kernel_header_filename )
    write_break( myfile )
    abc_micro_kernel_gen.write_common_rankk_macro_assembly( myfile )
    write_break( myfile )
    abc_micro_kernel_gen.macro_initialize_assembly( myfile )
    write_break( myfile )

    for kernel_id, row in enumerate( transpose( coeffs[2] ) ):
        if len( row ) == 0:
            # Empty rows produce no output at all, not even a blank line.
            continue

        active = []
        for value in row:
            if is_nonzero( value ):
                active.append( value )

        if len( active ) <= max_kernel_operands:
            abc_micro_kernel_gen.generate_micro_kernel( myfile, active, kernel_id )

        write_break( myfile )
Beispiel #5
0
def write_naive_strassen_header(myfile):
    """Emit the opening statements of the generated ``bl_dgemm_strassen_naive`` body.

    Written at indent level 1: the local declarations, an early return when
    any of ``m``, ``n`` or ``k`` is zero, and the code that reads the
    ``BLISLAB_IC_NT`` environment variable into ``bl_ic_nt`` (default 1).

    The early-return diagnostic names ``bl_dgemm_strassen_naive()``, the
    function this header is emitted into.
    """
    # ``None`` marks a blank separator line in the generated source.
    header = (
        'char *str;',
        'int  bl_ic_nt;',
        None,
        '// Early return if possible',
        'if ( m == 0 || n == 0 || k == 0 ) {',
        '    printf( "bl_dgemm_strassen_naive(): early return\\n" );',
        '    return;',
        '}',
        None,
        '// sequential is the default situation',
        'bl_ic_nt = 1;',
        '// check the environment variable',
        'str = getenv( "BLISLAB_IC_NT" );',
        'if ( str != NULL ) {',
        '    bl_ic_nt = (int)strtol( str, NULL, 10 );',
        '}',
        None,
    )
    for text in header:
        if text is None:
            write_break(myfile)
        else:
            write_line(myfile, 1, text)
Beispiel #6
0
def write_straprim_naive_function(myfile, index, a_coeffs, b_coeffs, c_coeffs,
                                  dims_mix, num_multiplies):
    """Emit the C function ``bl_dgemm_straprim_naive<index>`` to ``myfile``.

    The generated function:

    1. forms temporaries ``S<index>`` / ``T<index>`` via ``S_Add<index>`` /
       ``T_Add<index>`` whenever ``need_tmp_mat`` says the A / B operand
       requires one,
    2. allocates and zeroes ``M<index>``,
    3. calls ``bl_dgemm`` on the (possibly temporary) operands,
    4. applies ``M_Add<index>`` to the blocks selected by ``c_coeffs``, and
    5. frees every buffer it allocated.

    A ``// M<index> = ...`` comment describing the product precedes the
    function.  ``num_multiplies`` is accepted for signature compatibility and
    is not used.

    Raises:
        IndexError: if an operand needs no temporary yet has no non-zero
            coefficient (there is then no block to pass to ``bl_dgemm``).
    """

    def nonzero_blocks(coeff_index, coeffs):
        # (coefficient, block name) for each non-zero coefficient, in order.
        return [(coeff, getBlockName(coeff_index, pos, dims_mix))
                for pos, coeff in enumerate(coeffs) if is_nonzero(coeff)]

    # Resolve the block names once; they are reused by the comment, the
    # signature and the generated calls below.
    a_blocks = nonzero_blocks(0, a_coeffs)
    b_blocks = nonzero_blocks(1, b_coeffs)
    c_blocks = nonzero_blocks(2, c_coeffs)

    comment = '// M%d = (' % (index)
    comment += ' + '.join(
        ['%s * %s' % (str(coeff), name) for coeff, name in a_blocks])
    comment += ') * ('
    comment += ' + '.join(
        ['%s * %s' % (str(coeff), name) for coeff, name in b_blocks])
    comment += '); '
    comment += '; '.join(
        [' %s += %s * M%d' % (name, coeff, index) for coeff, name in c_blocks])
    comment += ';'
    write_line(myfile, 0, comment)

    signature = 'void bl_dgemm_straprim_naive%d( int ms, int ns, int ks, ' % index
    signature += ', '.join(['double* %s' % name for _, name in a_blocks])
    signature += ', int lda, '
    signature += ', '.join(['double* %s' % name for _, name in b_blocks])
    signature += ', int ldb, '
    signature += ', '.join(['double* %s' % name for _, name in c_blocks])
    signature += ', int ldc, int bl_ic_nt ) {'
    write_line(myfile, 0, signature)
    write_line(myfile, 1,
               'int ldS = ms, nS = ks, ldT = ks, nT = ns, ldM = ms, nM = ns;')

    # Row/column extent arguments per operand kind (0 = A, 1 = B, 2 = C).
    extents = {0: 'ms, ks, ', 1: 'ks, ns, ', 2: 'ms, ns, '}

    def para_ld(coeff_index):
        # Fail loudly on an unknown operand kind instead of printing and
        # then crashing on an unbound local.
        if coeff_index not in extents:
            raise ValueError('Wrong coeff_index: %r' % (coeff_index,))
        return extents[coeff_index]

    def addition_str(blocks, coeff_index, mat_name, tmp_name):
        # "<tmp>_Add<index>( <extents>, <blocks...>, ld<mat>, <tmp><index>, ld<tmp>, bl_ic_nt );"
        call = '%s_Add%d( %s' % (tmp_name, index, para_ld(coeff_index))
        call += ''.join([name + ', ' for _, name in blocks])
        call += 'ld%s, ' % (mat_name)
        call += '%s%d' % (tmp_name, index) + ', ld%s, bl_ic_nt );' % tmp_name
        return call

    a_needs_tmp = need_tmp_mat(a_coeffs)
    b_needs_tmp = need_tmp_mat(b_coeffs)

    # Write the adds to temps if necessary.
    if a_needs_tmp:
        instantiate_tmp(myfile, 'S', index)
        write_line(myfile, 1, addition_str(a_blocks, 0, 'a', 'S'))

    if b_needs_tmp:
        instantiate_tmp(myfile, 'T', index)
        write_line(myfile, 1, addition_str(b_blocks, 1, 'b', 'T'))

    write_line(
        myfile, 1,
        'double* M%d = bl_malloc_aligned( ldM, nM, sizeof(double) );' % (index))
    write_line(myfile, 1,
               'memset( M%d, 0, sizeof(double) * ldM * nM );' % (index))

    res_mat = 'M%d' % (index)

    def operand(blocks, needs_tmp, mat_name, tmp_name):
        # (pointer expression, leading-dimension suffix) handed to bl_dgemm:
        # the temporary when one was built, otherwise the first non-zero
        # block of the original operand.
        if needs_tmp:
            return '%s%d' % (tmp_name, index), '%s' % (tmp_name)
        return blocks[0][1], mat_name

    a_name, a_ld = operand(a_blocks, a_needs_tmp, 'a', 'S')
    b_name, b_ld = operand(b_blocks, b_needs_tmp, 'b', 'T')

    # Finally, write the actual call to matrix multiply.
    write_line(
        myfile, 1, 'bl_dgemm( ms, ns, ks, %s, ld%s, %s, ld%s, %s, ldM );' %
        (a_name, a_ld, b_name, b_ld, res_mat))

    write_line(myfile, 1, addition_str(c_blocks, 2, 'c', 'M'))

    # Release whatever temporaries were allocated above.
    if a_needs_tmp:
        write_line(myfile, 1, 'free( S%d );' % (index))
    if b_needs_tmp:
        write_line(myfile, 1, 'free( T%d );' % (index))

    write_line(myfile, 1, 'free( M%d );' % (index))
    write_line(myfile, 0, '}')
    write_break(myfile)
Beispiel #7
0
def instantiate_tmp(myfile, tmp_name, mult_index):
    """Emit the declaration and allocation of temporary ``<tmp_name><mult_index>``.

    The generated statement calls ``bl_malloc_aligned`` with the C variables
    ``ld<tmp_name>`` and ``n<tmp_name>`` and is written at indent level 1.
    """
    fields = {'tmp': tmp_name, 'idx': mult_index}
    declaration = (
        'double* %(tmp)s%(idx)d = bl_malloc_aligned( ld%(tmp)s, n%(tmp)s, sizeof(double) );'
        % fields)
    write_line(myfile, 1, declaration)
Beispiel #8
0
def write_packm_func(myfile, coeffs, index, mat_name):
    """Emit the C function ``pack<mat_name>_add_stra_ab<index>``.

    The generated function takes one pointer per non-zero entry of ``coeffs``
    and fills ``pack<mat_name>`` with the expression produced by
    ``arith_expression_pntr`` for those entries.

    Args:
        myfile: open file object the C source is written to.
        coeffs: coefficients of the combination; zero entries are skipped.
        index: numeric suffix of the generated function name.
        mat_name: ``'A'`` or ``'B'``; selects the packing register constant
            (``DGEMM_MR`` / ``DGEMM_NR``) and the source strides.

    Raises:
        ValueError: if ``mat_name`` is neither ``'A'`` nor ``'B'``.  This is
            checked before anything is written, so no partial function is
            left in ``myfile``.
    """
    # ldp:  leading dimension of the packed buffer
    # ldm:  stride applied to j when positioning the source pointers
    # incm: stride forwarded to arith_expression_pntr
    if mat_name == 'A':
        ldp = 'DGEMM_MR'
        ldm = 'ld%s' % mat_name
        incm = '1'
    elif mat_name == 'B':
        ldp = 'DGEMM_NR'
        ldm = '1'
        incm = 'ld%s' % mat_name
    else:
        raise ValueError('Wrong mat_name: %r (expected \'A\' or \'B\')' %
                         (mat_name,))

    nonzero_coeffs = [coeff for coeff in coeffs if is_nonzero(coeff)]
    nnz = len(nonzero_coeffs)

    add = 'inline void pack%s_add_stra_ab%d( int m, int n, ' % (mat_name,
                                                                index)
    add += ', '.join(['double *%s%d' % (mat_name, i) for i in range(nnz)])
    add += ', int ld%s, double *pack%s ' % (mat_name, mat_name)
    add += ') {'
    write_line(myfile, 0, add)

    write_line(myfile, 1, 'int i, j;')

    add = 'double '
    add += ', '.join(['*%s%d_pntr' % (mat_name, i) for i in range(nnz)])
    add += ', *pack%s_pntr;' % mat_name
    write_line(myfile, 1, add)

    write_line(myfile, 1, 'for ( j = 0; j < n; ++j ) {')
    write_line(myfile, 2,
               'pack%s_pntr = &pack%s[ %s * j ];' % (mat_name, mat_name, ldp))

    # A unit stride is written as a plain "[ j ]" rather than "[ 1 * j ]".
    if ldm == '1':
        add = ''.join([
            '%s%d_pntr = &%s%d[ j ]; ' % (mat_name, i, mat_name, i)
            for i in range(nnz)
        ])
    else:
        add = ''.join([
            '%s%d_pntr = &%s%d[ %s * j ]; ' % (mat_name, i, mat_name, i, ldm)
            for i in range(nnz)
        ])
    write_line(myfile, 2, add)

    write_line(myfile, 2, 'for ( i = 0; i < %s; ++i ) {' % ldp)

    add = 'pack%s_pntr[ i ]' % mat_name + ' ='
    for position, coeff in enumerate(nonzero_coeffs):
        add += arith_expression_pntr(coeff, mat_name, position, incm)
    add += ';'
    write_line(myfile, 3, add)

    write_line(myfile, 2, '}')
    write_line(myfile, 1, '}')

    write_line(myfile, 0, '}')  # end of function
Beispiel #9
0
def write_straprim_abc_function( myfile, index, a_coeffs, b_coeffs, c_coeffs, dims_mix ):
    """Emit the C function ``bl_dgemm_straprim_abc<index>`` to ``myfile``.

    A ``// M<index> = ...`` comment built from the non-zero coefficients and
    their block names comes first.  The generated function takes one pointer
    per non-zero coefficient of each operand and contains the ``jc``/``pc``
    loop nest with the ``packB_add_stra_abc<index>``,
    ``packA_add_stra_abc<index>`` and ``bl_macro_kernel_stra_abc<index>``
    calls, separated by ``#pragma omp barrier`` lines guarded by
    ``_PARALLEL_``.
    """

    def weighted_terms( operand, coeffs ):
        terms = []
        for pos, coeff in enumerate( coeffs ):
            if is_nonzero( coeff ):
                terms.append( str( coeff ) + ' * %s' % getBlockName( operand, pos, dims_mix ) )
        return ' + '.join( terms )

    def numbered( template, count ):
        # template holds a single %d that receives the operand number.
        return ', '.join( [template % n for n in range( count )] )

    lhs = weighted_terms( 0, a_coeffs )
    rhs = weighted_terms( 1, b_coeffs )
    updates = []
    for pos, coeff in enumerate( c_coeffs ):
        if is_nonzero( coeff ):
            updates.append( ' %s += %s * M%d' % ( getBlockName( 2, pos, dims_mix ), coeff, index ) )
    summary = '// M%d = (' % (index) + lhs + ') * (' + rhs + '); ' + '; '.join( updates ) + ';'
    write_line( myfile, 0, summary )

    num_a = num_nonzero( a_coeffs )
    num_b = num_nonzero( b_coeffs )
    num_c = num_nonzero( c_coeffs )

    signature = (
        'void bl_dgemm_straprim_abc%d( int m, int n, int k, ' % index
        + numbered( 'double* a%d', num_a )
        + ', int lda, '
        + numbered( 'double* b%d', num_b )
        + ', int ldb, '
        + numbered( 'double* c%d', num_c )
        + ', int ldc, double *packA, double *packB, int bl_ic_nt ) {'
    )

    pack_b_call = (
        'packB_add_stra_abc%d( min( jb - j, DGEMM_NR ), pb, ' % index
        + numbered( '&b%d[ pc + (jc+j)*ldb ]', num_b )
        + ', ldb, &packB[ j * pb ] );'
    )
    pack_a_call = (
        'packA_add_stra_abc%d( min( ib - i, DGEMM_MR ), pb, ' % index
        + numbered( '&a%d[ pc*lda + (ic+i) ]', num_a )
        + ', lda, &packA[ tid * DGEMM_MC * pb + i * pb ] );'
    )
    macro_call = (
        'bl_macro_kernel_stra_abc%d( ib, jb, pb, packA + tid * DGEMM_MC * pb, packB, ' % index
        + numbered( '&c%d[ jc * ldc + ic ]', num_c )
        + ', ldc );'
    )

    barrier = [
        ( 0, '#ifdef _PARALLEL_' ),
        ( 0, '#pragma omp barrier' ),
        ( 0, '#endif' ),
    ]

    # (indent level, text) for every line of the generated function, in order.
    layout = [
        ( 0, signature ),
        ( 1, 'int i, j, p, ic, ib, jc, jb, pc, pb;' ),
        ( 1, 'for ( jc = 0; jc < n; jc += DGEMM_NC ) {' ),
        ( 2, 'jb = min( n - jc, DGEMM_NC );' ),
        ( 2, 'for ( pc = 0; pc < k; pc += DGEMM_KC ) {' ),
        ( 3, 'pb = min( k - pc, DGEMM_KC );' ),
        # Pack the combined B operand.
        ( 3, '{' ),
        ( 4, 'int tid = omp_get_thread_num();' ),
        ( 4, 'int my_start;' ),
        ( 4, 'int my_end;' ),
        ( 4, 'bl_get_range( jb, DGEMM_NR, &my_start, &my_end );' ),
        ( 4, 'for ( j = my_start; j < my_end; j += DGEMM_NR ) {' ),
        ( 5, pack_b_call ),
        ( 4, '}' ),
        ( 3, '}' ),
    ]
    layout += barrier
    layout += [
        # Pack the combined A operand and run the macro kernel.
        ( 3, '{' ),
        ( 4, 'int tid = omp_get_thread_num();' ),
        ( 4, 'int my_start;' ),
        ( 4, 'int my_end;' ),
        ( 4, 'bl_get_range( m, DGEMM_MR, &my_start, &my_end );' ),
        ( 4, 'for ( ic = my_start; ic < my_end; ic += DGEMM_MC ) {' ),
        ( 5, 'ib = min( my_end - ic, DGEMM_MC );' ),
        ( 5, 'for ( i = 0; i < ib; i += DGEMM_MR ) {' ),
        ( 6, pack_a_call ),
        ( 5, '}' ),
        ( 5, macro_call ),
        ( 4, '}' ),
        ( 3, '}' ),
    ]
    layout += barrier
    layout += [
        ( 2, '}' ),
        ( 1, '}' ),
    ]
    layout += barrier
    layout.append( ( 0, '}' ) )

    for depth, text in layout:
        write_line( myfile, depth, text )
    write_break( myfile )
Beispiel #10
0
def write_M_add_func(myfile, coeffs, index, mat_name):
    """Emit the C function ``<mat_name>_Add<index>``.

    The generated function receives one ``double*`` per non-zero entry of
    ``coeffs`` plus ``R``; inside a doubly nested ``j``/``i`` loop it updates
    each of those operands with ``+=`` of the expression that
    ``arith_expression`` builds from the coefficient and ``R``.  The outer
    loop carries an OpenMP ``parallel for`` pragma guarded by ``_PARALLEL_``.
    """
    active = [value for value in coeffs if is_nonzero(value)]

    pointer_params = ', '.join(
        ['double* %s%d' % (mat_name, k) for k in range(len(active))])
    header = ''.join([
        'void %s_Add%d( int m, int n, ' % (mat_name, index),
        pointer_params,
        ', int ld%s, double* R, int ldR, int bl_ic_nt ' % (mat_name),
        ') {',
    ])
    write_line(myfile, 0, header)

    prologue = (
        (1, 'int i, j;'),
        (0, '#ifdef _PARALLEL_'),
        (1, '#pragma omp parallel for num_threads( bl_ic_nt )'),
        (0, '#endif'),
        (1, 'for ( j = 0; j < n; ++j ) {'),
        (2, 'for ( i = 0; i < m; ++i ) {'),
    )
    for depth, text in prologue:
        write_line(myfile, depth, text)

    # One accumulate statement per non-zero coefficient.
    for position, value in enumerate(active):
        statement = '%s += %s;' % (data_access(mat_name, str(position)),
                                   arith_expression(value, 'R', ''))
        write_line(myfile, 3, statement)

    for depth in (2, 1, 0):
        write_line(myfile, depth, '}')
Beispiel #11
0
def write_ab_strassen_header(myfile):
    """Emit the opening statements of the generated ``ab`` driver body.

    Written at indent level 1: local declarations, an early return when any
    of ``m``, ``n`` or ``k`` is zero, the code reading ``BLISLAB_IC_NT`` into
    ``bl_ic_nt`` (default 1), and the allocation of the ``packA`` / ``packB``
    buffers.
    """
    # Each inner tuple is a run of lines followed by one blank line.
    paragraphs = (
        (
            'double *packA, *packB;',
            'char *str;',
            'int  bl_ic_nt;',
        ),
        (
            '// Early return if possible',
            'if ( m == 0 || n == 0 || k == 0 ) {',
            '    printf( "bl_dgemm_strassen_ab(): early return\\n" );',
            '    return;',
            '}',
        ),
        (
            '// sequential is the default situation',
            'bl_ic_nt = 1;',
            '// check the environment variable',
            'str = getenv( "BLISLAB_IC_NT" );',
            'if ( str != NULL ) {',
            '    bl_ic_nt = (int)strtol( str, NULL, 10 );',
            '}',
        ),
        (
            '// Allocate packing buffers',
            'packA  = bl_malloc_aligned( DGEMM_KC, ( DGEMM_MC + 1 ) * bl_ic_nt, sizeof(double) );',
            'packB  = bl_malloc_aligned( DGEMM_KC, ( DGEMM_NC + 1 )           , sizeof(double) );',
        ),
    )
    for paragraph in paragraphs:
        for text in paragraph:
            write_line(myfile, 1, text)
        write_break(myfile)
Beispiel #12
0
def write_straprim_ab_function(myfile, index, a_coeffs, b_coeffs, c_coeffs,
                               dims_mix):
    """Emit the C function ``bl_dgemm_straprim_ab<index>`` to ``myfile``.

    A ``// M<index> = ...`` comment built from the non-zero coefficients and
    their block names comes first.  The generated function allocates and
    zeroes a scratch matrix ``M``, runs the ``jc``/``pc`` loop nest with the
    ``packB_add_stra_ab<index>`` / ``packA_add_stra_ab<index>`` packing calls
    and ``bl_macro_kernel_stra_ab`` writing into ``M``, then calls
    ``M_Add<index>`` on the ``c`` operands and frees ``M``.
    """

    def weighted_terms(operand, coeffs):
        terms = []
        for pos, coeff in enumerate(coeffs):
            if is_nonzero(coeff):
                terms.append(
                    str(coeff) + ' * %s' % getBlockName(operand, pos, dims_mix))
        return ' + '.join(terms)

    def numbered(template, count):
        # template holds a single %d that receives the operand number.
        return ', '.join([template % n for n in range(count)])

    lhs = weighted_terms(0, a_coeffs)
    rhs = weighted_terms(1, b_coeffs)
    updates = []
    for pos, coeff in enumerate(c_coeffs):
        if is_nonzero(coeff):
            updates.append(' %s += %s * M%d' %
                           (getBlockName(2, pos, dims_mix), coeff, index))
    summary = ('// M%d = (' % (index) + lhs + ') * (' + rhs + '); ' +
               '; '.join(updates) + ';')
    write_line(myfile, 0, summary)

    num_a = num_nonzero(a_coeffs)
    num_b = num_nonzero(b_coeffs)
    num_c = num_nonzero(c_coeffs)

    signature = ('void bl_dgemm_straprim_ab%d( int m, int n, int k, ' % index +
                 numbered('double* a%d', num_a) + ', int lda, ' +
                 numbered('double* b%d', num_b) + ', int ldb, ' +
                 numbered('double* c%d', num_c) +
                 ', int ldc, double *packA, double *packB, int bl_ic_nt ) {')

    pack_b_call = ('packB_add_stra_ab%d( min( jb - j, DGEMM_NR ), pb, ' % index
                   + numbered('&b%d[ pc + (jc+j)*ldb ]', num_b) +
                   ', ldb, &packB[ j * pb ] );')
    pack_a_call = ('packA_add_stra_ab%d( min( ib - i, DGEMM_MR ), pb, ' % index
                   + numbered('&a%d[ pc*lda + (ic+i) ]', num_a) +
                   ', lda, &packA[ tid * DGEMM_MC * pb + i * pb ] );')

    # (indent level, text) for every line up to the end of the loop nest.
    layout = [
        (0, signature),
        (1, 'int i, j, p, ic, ib, jc, jb, pc, pb;'),
        (1, 'int ldM = m, nM = n;'),
        (1, 'double *M = bl_malloc_aligned( ldM, nM, sizeof(double) );'),
        (1, 'memset( M, 0, sizeof(double) * ldM * nM );'),
        (1, 'for ( jc = 0; jc < n; jc += DGEMM_NC ) {'),
        (2, 'jb = min( n - jc, DGEMM_NC );'),
        (2, 'for ( pc = 0; pc < k; pc += DGEMM_KC ) {'),
        (3, 'pb = min( k - pc, DGEMM_KC );'),
        # Pack the combined B operand.
        (0, '#ifdef _PARALLEL_'),
        (3, '#pragma omp parallel for num_threads( bl_ic_nt ) private( j )'),
        (0, '#endif'),
        (3, 'for ( j = 0; j < jb; j += DGEMM_NR ) {'),
        (4, pack_b_call),
        (3, '}'),
        # Pack the combined A operand and run the macro kernel into M.
        (0, '#ifdef _PARALLEL_'),
        (3, '#pragma omp parallel num_threads( bl_ic_nt ) private( ic, ib, i )'),
        (0, '#endif'),
        (3, '{'),
        (4, 'int tid = omp_get_thread_num();'),
        (4, 'int my_start;'),
        (4, 'int my_end;'),
        (4, 'bl_get_range( m, DGEMM_MR, &my_start, &my_end );'),
        (4, 'for ( ic = my_start; ic < my_end; ic += DGEMM_MC ) {'),
        (5, 'ib = min( my_end - ic, DGEMM_MC );'),
        (5, 'for ( i = 0; i < ib; i += DGEMM_MR ) {'),
        (6, pack_a_call),
        (5, '}'),
        (5, 'bl_macro_kernel_stra_ab( ib, jb, pb, packA + tid * DGEMM_MC * pb, packB, &M[ jc * ldM + ic ], ldM );'),
        (4, '}'),
        (3, '}'),
        (2, '}'),
        (1, '}'),
    ]
    for depth, text in layout:
        write_line(myfile, depth, text)

    # Apply M to the c operands; the operand list stays empty when c_coeffs
    # itself is empty.
    accumulate = 'M_Add%d( m, n, ' % (index)
    if len(c_coeffs) != 0:
        active = sum(1 for coeff in c_coeffs if is_nonzero(coeff))
        accumulate += numbered('c%d', active)
    accumulate += ', ldc, M, ldM, bl_ic_nt );'
    write_line(myfile, 1, accumulate)

    write_line(myfile, 1, 'free( M );')
    write_line(myfile, 0, '}')
    write_break(myfile)
Beispiel #13
0
def write_macro_func( myfile, coeffs, index, mat_name ):
    """Emit the C function ``bl_macro_kernel_stra_abc<index>``.

    The generated function takes one ``double *`` per non-zero entry of
    ``coeffs`` and loops over ``DGEMM_NR`` x ``DGEMM_MR`` tiles.  For each
    tile it calls ``bl_dgemm_micro_kernel_stra_abc<index>``, additionally
    passing an ``alpha_list`` array when ``contain_nontrivial`` reports so.
    With more than 23 non-zero coefficients the tile body is produced by
    ``write_mulstrassen_kernel_caller`` instead.
    """
    # Above this many operands the generic caller is used.
    max_kernel_operands = 23

    active = []
    for value in coeffs:
        if is_nonzero( value ):
            active.append( value )
    count = len( active )

    header = 'inline void bl_macro_kernel_stra_abc%d( int m, int n, int k, double *packA, double *packB, ' % ( index )
    header += ', '.join( ['double *%s%d' % ( mat_name, n ) for n in range( count )] )
    header += ', int ld%s ) {' % (mat_name)
    write_line( myfile, 0, header )

    # These lines carry their own extra indentation inside the text.
    preamble = (
        'int i, j;',
        'aux_t aux;',
        'aux.b_next = packB;',
        'for ( j = 0; j < n; j += DGEMM_NR ) {',
        '    aux.n  = min( n - j, DGEMM_NR );',
        '    for ( i = 0; i < m; i += DGEMM_MR ) {',
        '        aux.m = min( m - i, DGEMM_MR );',
        '        if ( i + DGEMM_MR >= m ) {',
        '            aux.b_next += DGEMM_NR * k;',
        '        }',
    )
    for text in preamble:
        write_line( myfile, 1, text )

    if count > max_kernel_operands:
        write_mulstrassen_kernel_caller( myfile, active )
    else:
        scaled = contain_nontrivial( active )
        if scaled:
            # Non-trivial coefficients are handed over through alpha_list.
            write_line( myfile, 3, 'double alpha_list[%d];' % count )
            assignments = [ 'alpha_list[%d]= (double)(%s)' % ( n, value ) for n, value in enumerate( active ) ]
            write_line( myfile, 3, '; '.join( assignments ) + ';' )
            tail = ', alpha_list , &aux );'
        else:
            tail = ', &aux );'

        call = '( bl_dgemm_micro_kernel_stra_abc%d ) ( k, &packA[ i * k ], &packB[ j * k ], ' % index
        call += '(unsigned long long) ld%s, ' % mat_name
        call += ', '.join( ['&%s%d[ j * ld%s + i ]' % ( mat_name, n, mat_name ) for n in range( count )] )
        call += tail
        write_line( myfile, 3, call )

    for depth in ( 2, 1, 0 ):
        write_line( myfile, depth, '}' )
Beispiel #14
0
def gen_abc_fmm( coeff_filename_mix, dims_mix, level_mix, outfilename, micro_kernel_filename, kernel_header_filename ):
    """Generate the ``bl_dgemm_strassen_abc`` driver, micro kernels and kernel header.

    Three files are written:

    * ``outfilename``: packing functions, macro kernels, the
      ``bl_dgemm_straprim_abc*`` functions and the ``bl_dgemm_strassen_abc``
      entry point;
    * ``micro_kernel_filename``: output of ``create_micro_functions``;
    * ``kernel_header_filename``: output of ``create_kernel_header``.

    ``coeff_filename_mix`` and ``dims_mix`` are walked in parallel with
    ``level_mix``: the i-th coefficient set (loaded with ``read_coeffs``) and
    the i-th dims entry are each repeated ``level_mix[i]`` times.

    The micro-kernel and header files are opened with ``with`` so they are
    flushed and closed as soon as they have been written, including when a
    generation helper raises.
    """
    coeffs_mix = []
    for idx, coeff_file in enumerate( coeff_filename_mix ):
        coeffs = read_coeffs( coeff_file )
        coeffs_mix.extend( coeffs for _ in range( level_mix[idx] ) )

    dims_level_mix = []
    for idx, dims in enumerate( dims_mix ):
        dims_level_mix.extend( dims for _ in range( level_mix[idx] ) )

    # The include path drops the first 10 characters of the header filename.
    # NOTE(review): presumably a fixed directory prefix - confirm with callers.
    header_include = kernel_header_filename[10:]

    with open( outfilename, 'w' ) as myfile:
        write_line( myfile, 0, '#include "%s"' % header_include )
        write_line( myfile, 0, '#include "bl_dgemm.h"' )
        write_break( myfile )

        cur_coeffs = generateCoeffs( coeffs_mix )

        num_multiplies = len(cur_coeffs[0][0])

        create_packm_functions( myfile, cur_coeffs )

        with open( micro_kernel_filename, 'w' ) as my_micro_file:
            create_micro_functions( my_micro_file, cur_coeffs, header_include )

        with open( kernel_header_filename, 'w' ) as my_kernel_header:
            create_kernel_header( my_kernel_header, cur_coeffs )

        create_macro_functions( myfile, cur_coeffs )

        create_straprim_abc_functions( myfile, cur_coeffs, dims_level_mix )

        write_line( myfile, 0, 'void bl_dgemm_strassen_abc( int m, int n, int k, double *XA, int lda, double *XB, int ldb, double *XC, int ldc )' )
        write_line( myfile, 0, '{' )

        write_abc_strassen_header( myfile )

        writePartition( myfile, dims_level_mix )

        write_break( myfile )

        # The straprim calls are wrapped in a single parallel region.
        write_line( myfile, 0, '#ifdef _PARALLEL_')
        write_line( myfile, 1, '#pragma omp parallel num_threads( bl_ic_nt )' )
        write_line( myfile, 0, '#endif')
        write_line( myfile, 1, '{' )
        create_straprim_caller( myfile, cur_coeffs, dims_level_mix, num_multiplies )
        write_line( myfile, 1, '}' )

        write_break( myfile )
        level_dim = exp_dim_mix( dims_level_mix )
        write_line( myfile, 1, 'bl_dynamic_peeling( m, n, k, XA, lda, XB, ldb, XC, ldc, %d * DGEMM_MR, %d, %d * DGEMM_NR );' % ( level_dim[0], level_dim[1], level_dim[2] ) )

        write_break( myfile )
        # The frees are emitted commented out in the generated C.
        write_line( myfile, 1, '//free( packA );' )
        write_line( myfile, 1, '//free( packB );' )

        write_line( myfile, 0, '}' )
Beispiel #15
0
def write_abc_strassen_header( myfile ):
    """Emit the opening statements of the generated ``abc`` driver body.

    Declares ``packA`` / ``packB``, initialises ``bl_ic_nt`` through
    ``bl_read_nway_from_env( "BLISLAB_IC_NT" )`` and obtains the packing
    buffers with ``bl_malloc_packing_pool``.  The direct
    ``bl_malloc_aligned`` allocations are emitted as C comments only.
    """
    # Each group is written at indent level 1 and followed by a blank line.
    groups = [
        [
            'double *packA, *packB;',
        ],
        [
            'int bl_ic_nt = bl_read_nway_from_env( "BLISLAB_IC_NT" );',
        ],
        [
            '//// Allocate packing buffers',
            '//packA  = bl_malloc_aligned( DGEMM_KC, ( DGEMM_MC + 1 ) * bl_ic_nt, sizeof(double) );',
            '//packB  = bl_malloc_aligned( DGEMM_KC, ( DGEMM_NC + 1 )           , sizeof(double) );',
            'bl_malloc_packing_pool( &packA, &packB, n, bl_ic_nt );',
        ],
    ]
    for group in groups:
        for text in group:
            write_line( myfile, 1, text )
        write_break( myfile )
Beispiel #16
0
def create_macro_functions(myfile, coeffs):
    """Emit the C function ``bl_macro_kernel_stra_ab``.

    The generated function loops over ``DGEMM_NR`` x ``DGEMM_MR`` tiles and
    calls ``( *bl_micro_kernel )`` on each one.  ``coeffs`` is accepted for
    signature compatibility and is not used.
    """
    layout = (
        (0, 'inline void bl_macro_kernel_stra_ab( int m, int n, int k, double *packA, double *packB, double *C, int ldC ) {'),
        (1, 'int i, j;'),
        (1, 'aux_t aux;'),
        (1, 'aux.b_next = packB;'),
        # The loop lines carry their extra indentation inside the text.
        (1, 'for ( j = 0; j < n; j += DGEMM_NR ) {'),
        (1, '    aux.n  = min( n - j, DGEMM_NR );'),
        (1, '    for ( i = 0; i < m; i += DGEMM_MR ) {'),
        (1, '        aux.m = min( m - i, DGEMM_MR );'),
        (1, '        if ( i + DGEMM_MR >= m ) {'),
        (1, '            aux.b_next += DGEMM_NR * k;'),
        (1, '        }'),
        (3, '( *bl_micro_kernel )( k, &packA[ i * k ], &packB[ j * k ], &C[ j * ldC + i ], (unsigned long long) ldC, &aux );'),
        (2, '}'),
        (1, '}'),
        (0, '}'),  # end of function
    )
    for depth, text in layout:
        write_line(myfile, depth, text)
Beispiel #17
0
def write_add_func(myfile, coeffs, index, mat_name):
    """Emit the C function ``<mat_name>_Add<index>`` for one set of coefficients.

    The generated function takes one ``double*`` per non-zero entry of
    ``coeffs`` plus ``R`` and contains a doubly nested ``j``/``i`` loop whose
    outer loop carries an OpenMP ``parallel for`` pragma guarded by
    ``_PARALLEL_``.

    * When ``mat_name`` is ``'M'`` the loop body has one statement per
      non-zero coefficient, each updating that operand with ``+=`` of the
      ``arith_expression`` built from the coefficient and ``R``.
    * Otherwise the loop body is a single assignment to ``R`` of the
      concatenated ``arith_expression`` terms of all operands.
    """
    active = [value for value in coeffs if is_nonzero(value)]

    pointer_params = ', '.join(
        ['double* %s%d' % (mat_name, k) for k in range(len(active))])
    header = ''.join([
        'void %s_Add%d( int m, int n, ' % (mat_name, index),
        pointer_params,
        ', int ld%s, double* R, int ldR, int bl_ic_nt ' % (mat_name),
        ') {',
    ])
    write_line(myfile, 0, header)

    # Both variants share the same declarations, pragma and loop headers.
    loop_open = (
        (1, 'int i, j;'),
        (0, '#ifdef _PARALLEL_'),
        (1, '#pragma omp parallel for num_threads( bl_ic_nt )'),
        (0, '#endif'),
        (1, 'for ( j = 0; j < n; ++j ) {'),
        (2, 'for ( i = 0; i < m; ++i ) {'),
    )
    for depth, text in loop_open:
        write_line(myfile, depth, text)

    if mat_name == 'M':
        # Output variant: scatter R into every operand.
        for position, value in enumerate(active):
            statement = '%s += %s;' % (data_access(mat_name, str(position)),
                                       arith_expression(value, 'R', ''))
            write_line(myfile, 3, statement)
    else:
        # Input variant: gather all operands into R.
        statement = data_access('R') + ' ='
        for position, value in enumerate(active):
            statement += arith_expression(value, mat_name, position)
        statement += ';'
        write_line(myfile, 3, statement)

    write_line(myfile, 2, '}')
    write_line(myfile, 1, '}')

    write_line(myfile, 0, '}')  # end of function
Beispiel #18
0
def write_mulstrassen_kernel_caller( myfile, nonzero_coeffs ):
    """Emit the tile body that calls ``bl_dgemm_asm_8x4_mulstrassen``.

    Written at indent level 3: declarations of ``alpha_list``, ``c_list`` and
    ``len_c`` sized by ``len( nonzero_coeffs )``, one assignment per
    coefficient for each array, and the kernel call itself.
    """
    count = len( nonzero_coeffs )

    alpha_assignments = []
    pointer_assignments = []
    for slot, value in enumerate( nonzero_coeffs ):
        alpha_assignments.append( 'alpha_list[%d]= (double)(%s)' % ( slot, value ) )
        pointer_assignments.append( 'c_list[%d] = &C%d[ j * ldC + i ]' % ( slot, slot ) )

    statements = (
        'double alpha_list[%d];' % count,
        'double *c_list[%d];' % count,
        'unsigned long long len_c=%d;' % count,
        '; '.join( alpha_assignments ) + ';',
        '; '.join( pointer_assignments ) + ';',
        '( bl_dgemm_asm_8x4_mulstrassen ) ( k, &packA[ i * k ], &packB[ j * k ], (unsigned long long) len_c, (unsigned long long) ldC, c_list, alpha_list, &aux );',
    )
    for text in statements:
        write_line( myfile, 3, text )