コード例 #1
0
ファイル: gen.py プロジェクト: timminn/fast-matmul
def create_add_functions(header, coeffs, pairwise):
    ''' Emit every custom add function into the generated header.

    header is the file to which we are writing
    coeffs is the set of all coefficients; entries 0-2 are always used and
        entries 3-5 are used only when they are present
    pairwise selects write_pairwise_add_func instead of write_add_func
    '''
    def emit_family(coeff_sets, name, bfs_par_avail):
        # Functions are numbered from 1.  An empty coefficient set produces
        # neither a function nor a break.
        for number, coeff_set in enumerate(coeff_sets, 1):
            if len(coeff_set) == 0:
                continue
            if pairwise:
                write_pairwise_add_func(header, coeff_set, number, name)
            else:
                write_add_func(header, coeff_set, number, name, bfs_par_avail)
            write_break(header)

    # S matrices formed from A subblocks
    emit_family(subexpr_elim.transpose(coeffs[0]), 'S', False)

    # T matrices formed from B subblocks
    emit_family(subexpr_elim.transpose(coeffs[1]), 'T', False)

    # Output C formed from multiplications
    emit_family(coeffs[2], 'M', True)

    # If there was CSE, create more add functions for the temporary matrices.
    for position, name in enumerate(('SX', 'TX', 'MX'), 3):
        if len(coeffs) > position:
            emit_family(coeffs[position], name, True)
コード例 #2
0
def compute_phi(coeffs):
    r''' Compute phi (in Bini et al. notation), which is defined as

            \phi = max { z | u_i^{(r)}v_j^{(r)}w_k^{(r)} = O(\epsilon^{-z}) },

        where u_i^{(r)}, v_j^{(r)}, and w_k^{(r)} are entries of the rth column of U, V, and W.

    coeffs is indexed at 0, 1, and 2; the transposes of those entries are
        used as U, V, and W.
    Returns the negation of the smallest exponent that carries a nonzero
    coefficient in any product of one entry each from matching rows of the
    transposed U, V, and W.
    '''

    U = transpose(coeffs[0])
    V = transpose(coeffs[1])
    W = transpose(coeffs[2])

    def smallest_exponent(u, v, w):
        def min_exp(num):
            # num.val is a dictionary of (exponent, coefficient).
            # Build a real list: filter() is lazy on Python 3, so calling
            # len() on its result raises TypeError there.
            exponents = [key for key, val in num.val.items() if val != 0]
            if not exponents:
                return 0
            return min(exponents)

        all_nums = [Number(x1) * Number(x2) * Number(x3)
                    for x1 in u for x2 in v for x3 in w]

        return min([min_exp(num) for num in all_nums])

    return -min([smallest_exponent(u, v, w) for u, v, w in zip(U, V, W)])
コード例 #3
0
def compute_phi(coeffs):
    r''' Compute phi (in Bini et al. notation), which is defined as

            \phi = max { z | u_i^{(r)}v_j^{(r)}w_k^{(r)} = O(\epsilon^{-z}) },

        where u_i^{(r)}, v_j^{(r)}, and w_k^{(r)} are entries of the rth column of U, V, and W.

    coeffs is indexed at 0, 1, and 2; the transposes of those entries are
        used as U, V, and W.
    Returns the negation of the smallest exponent that carries a nonzero
    coefficient in any product of one entry each from matching rows of the
    transposed U, V, and W.
    '''

    U = transpose(coeffs[0])
    V = transpose(coeffs[1])
    W = transpose(coeffs[2])

    def smallest_exponent(u, v, w):
        def min_exp(num):
            # num.val is a dictionary of (exponent, coefficient).
            # Build a real list: filter() is lazy on Python 3, so calling
            # len() on its result raises TypeError there.
            exponents = [key for key, val in num.val.items() if val != 0]
            if not exponents:
                return 0
            return min(exponents)

        all_nums = [Number(x1) * Number(x2) * Number(x3)
                    for x1 in u for x2 in v for x3 in w]

        return min([min_exp(num) for num in all_nums])

    return -min([smallest_exponent(u, v, w) for u, v, w in zip(U, V, W)])
コード例 #4
0
def main():
    ''' Print R, the three dimensions, and the largest entry of e_vec.

    Command line: python relative_quantities.py coeff_file m,k,n
    Any further arguments are accepted and ignored.

    Raises Exception carrying the usage string when an argument is missing
    or the dimensions are not comma-separated integers.
    '''
    try:
        coeff_file = sys.argv[1]
        dims = tuple([int(d) for d in sys.argv[2].split(',')])
    except (IndexError, ValueError):
        # Only a missing argument or a non-integer dimension is a usage
        # error; anything else (e.g. KeyboardInterrupt) must propagate.
        raise Exception(
            'USAGE: python relative_quantities.py coeff_file m,k,n')

    coeffs = read_coeffs(coeff_file)

    # Using the notation from our paper
    a_vec = np.array(
        [sum(np.abs([float(x) for x in row])) for row in transpose(coeffs[0])])
    b_vec = np.array(
        [sum(np.abs([float(x) for x in row])) for row in transpose(coeffs[1])])

    e_vec = [
        np.dot(np.abs([float(x) for x in row]), a_vec * b_vec)
        for row in coeffs[2]
    ]

    R = len(coeffs[0][0])

    # A single pre-joined string prints identically on Python 2 and 3.
    fields = (R, dims[0], dims[1], dims[2], int(max(e_vec)))
    print(' '.join(str(val) for val in fields))
コード例 #5
0
def main():
    ''' Print emax, the product of the three max norms, and emax times it.

    Command line: python stability_vector.py coeff_file m,k,n

    Raises Exception carrying the usage string when an argument is missing
    or the dimensions are not comma-separated integers.
    '''
    try:
        coeff_file = sys.argv[1]
        # dims is not used below; it is parsed so malformed input is rejected.
        dims = tuple([int(d) for d in sys.argv[2].split(',')])
    except (IndexError, ValueError):
        # Only a missing argument or a non-integer dimension is a usage
        # error; anything else (e.g. KeyboardInterrupt) must propagate.
        raise Exception('USAGE: python stability_vector.py coeff_file m,k,n')

    coeffs = read_coeffs(coeff_file)

    # Using the notation from our paper
    a_vec = np.array([
        sum(np.abs([zero_one(x) for x in row])) for row in transpose(coeffs[0])
    ])
    b_vec = np.array([
        sum(np.abs([zero_one(x) for x in row])) for row in transpose(coeffs[1])
    ])

    e_vec = [
        np.dot(np.abs([float(x) for x in row]), a_vec * b_vec)
        for row in coeffs[2]
    ]

    mn = max_norm(coeffs[0]) * max_norm(coeffs[1]) * max_norm(coeffs[2])
    emax = int(np.max(e_vec))

    # A single pre-joined string prints identically on Python 2 and 3.
    print(' '.join(str(val) for val in (emax, mn, emax * mn)))
コード例 #6
0
def main():
    ''' Print summary quantities for a coefficient file.

    Command line: python stability_vector.py coeff_file m,k,n [full]

    The first output line is: m*k*n, rank, nnz, max(q_vec), max(e_vec).
    When a non-empty third argument is given, e_vec and q_vec are also
    printed as tables with dims[0] rows of dims[2] values each.

    Raises Exception carrying the usage string when an argument is missing
    or the dimensions are not comma-separated integers.
    '''
    try:
        coeff_file = sys.argv[1]
        dims = tuple([int(d) for d in sys.argv[2].split(',')])
    except (IndexError, ValueError):
        # Only a missing argument or a non-integer dimension is a usage
        # error; anything else (e.g. KeyboardInterrupt) must propagate.
        raise Exception('USAGE: python stability_vector.py coeff_file m,k,n')

    # NOTE(review): the flag is kept as the raw argument string, so any
    # non-empty value (including '0') enables the full output.
    full_stab_mat = 0
    if len(sys.argv) > 3:
        full_stab_mat = sys.argv[3]

    coeffs = read_coeffs(coeff_file)

    # Using the notation from our paper
    a_vec = np.array(
        [sum(np.abs([float(x) for x in row])) for row in transpose(coeffs[0])])
    b_vec = np.array(
        [sum(np.abs([float(x) for x in row])) for row in transpose(coeffs[1])])

    e_vec = [
        np.dot(np.abs([float(x) for x in row]), a_vec * b_vec)
        for row in coeffs[2]
    ]

    alpha_vec = np.array([num_nonzero(row) for row in transpose(coeffs[0])])
    beta_vec = np.array([num_nonzero(row) for row in transpose(coeffs[1])])
    ab_vec = alpha_vec + beta_vec
    gamma_vec = [num_nonzero(row) for row in coeffs[2]]

    # W_ind is the 0/1 indicator of the nonzero pattern of coeffs[2].
    W_ind = np.array([[float(val) for val in row] for row in coeffs[2]])
    W_ind[np.nonzero(W_ind)] = 1
    q_vec = [gamma_vec[k] + np.max(ab_vec * W_ind[k, :])
             for k in range(W_ind.shape[0])]

    # Number of additions
    nnz = sum(ab_vec) + sum(gamma_vec)

    # print as q_vec emax
    rank = len(coeffs[0][0])
    mkn = dims[0] * dims[1] * dims[2]
    # A single pre-joined string prints identically on Python 2 and 3.
    summary = (mkn, rank, nnz, int(np.max(q_vec)), int(np.max(e_vec)))
    print(' '.join(str(val) for val in summary))

    if full_stab_mat:
        # Print in the same style as D'Alberto presents in his 2014 paper.
        out_format = '\t' + ' '.join(['%4d'] * dims[2])
        for title, vec in (('e vector:', e_vec), ('q vector:', q_vec)):
            print(title)
            for i in range(dims[0]):
                vals = vec[(i * dims[2]):(i * dims[2] + dims[2])]
                print(out_format % tuple(vals))
コード例 #7
0
def main():
    ''' Print summary quantities for a coefficient file.

    Command line: python stability_vector.py coeff_file m,k,n [full]

    The first output line is: m*k*n, rank, nnz, max(q_vec), max(e_vec).
    When a non-empty third argument is given, e_vec and q_vec are also
    printed as tables with dims[0] rows of dims[2] values each.

    Raises Exception carrying the usage string when an argument is missing
    or the dimensions are not comma-separated integers.
    '''
    try:
        coeff_file = sys.argv[1]
        dims = tuple([int(d) for d in sys.argv[2].split(',')])
    except (IndexError, ValueError):
        # Only a missing argument or a non-integer dimension is a usage
        # error; anything else (e.g. KeyboardInterrupt) must propagate.
        raise Exception('USAGE: python stability_vector.py coeff_file m,k,n')

    # NOTE(review): the flag is kept as the raw argument string, so any
    # non-empty value (including '0') enables the full output.
    full_stab_mat = 0
    if len(sys.argv) > 3:
        full_stab_mat = sys.argv[3]

    coeffs = read_coeffs(coeff_file)

    # Using the notation from our paper
    a_vec = np.array(
        [sum(np.abs([float(x) for x in row])) for row in transpose(coeffs[0])])
    b_vec = np.array(
        [sum(np.abs([float(x) for x in row])) for row in transpose(coeffs[1])])

    e_vec = [
        np.dot(np.abs([float(x) for x in row]), a_vec * b_vec)
        for row in coeffs[2]
    ]

    alpha_vec = np.array([num_nonzero(row) for row in transpose(coeffs[0])])
    beta_vec = np.array([num_nonzero(row) for row in transpose(coeffs[1])])
    ab_vec = alpha_vec + beta_vec
    gamma_vec = [num_nonzero(row) for row in coeffs[2]]

    # W_ind is the 0/1 indicator of the nonzero pattern of coeffs[2].
    W_ind = np.array([[float(val) for val in row] for row in coeffs[2]])
    W_ind[np.nonzero(W_ind)] = 1
    q_vec = [gamma_vec[k] + np.max(ab_vec * W_ind[k, :])
             for k in range(W_ind.shape[0])]

    # Number of additions
    nnz = sum(ab_vec) + sum(gamma_vec)

    # print as q_vec emax
    rank = len(coeffs[0][0])
    mkn = dims[0] * dims[1] * dims[2]
    # A single pre-joined string prints identically on Python 2 and 3.
    summary = (mkn, rank, nnz, int(np.max(q_vec)), int(np.max(e_vec)))
    print(' '.join(str(val) for val in summary))

    if full_stab_mat:
        # Print in the same style as D'Alberto presents in his 2014 paper.
        out_format = '\t' + ' '.join(['%4d'] * dims[2])
        for title, vec in (('e vector:', e_vec), ('q vector:', q_vec)):
            print(title)
            for i in range(dims[0]):
                vals = vec[(i * dims[2]):(i * dims[2] + dims[2])]
                print(out_format % tuple(vals))
コード例 #8
0
ファイル: gen.py プロジェクト: timminn/fast-matmul
def need_streaming_cse_tmp(ind, coeffs, mat_dims):
    ''' Decide whether index ind needs an extra temporary matrix for
    streaming additions.  This happens when an expression that was
    eliminated through common subexpression elimination is the only term
    of a length-1 addition string in a multiplication.  For example,

       M1 = (A11 + A12 + A13) * (B11 + B12)
       M2 = (A11 + A12) * (B21 + B22)

    We would eliminate A_X = A11 + A12 and have:

       M1 = (A_X + A13) * (B11 + B12)
       M2 = (A_X) * (B21 + B22)

    A_X gets used as part of a length-1 addition string for M2.

    ind is the index (zero-indexed linearly) of the matrix in the
        coefficient file
    coeffs is the set of coefficients for the U or V matrix
    mat_dims is 2-tuple of the matrix dimensions (A or B corresponding to U or V)
    '''
    num_subblocks = mat_dims[0] * mat_dims[1]
    if ind < num_subblocks:
        # Indices below the subblock count never need the extra temporary.
        return False

    for col in subexpr_elim.transpose(coeffs):
        nonzero_rows = [row for row, val in enumerate(col) if is_nonzero(val)]
        # A column whose only nonzero entry sits at ind is a length-1 string.
        if nonzero_rows == [ind]:
            return True

    return False
コード例 #9
0
ファイル: counts.py プロジェクト: liyancas/fast-matmul
def main():
    """ Print a table of addition, read, and write counts.

    Command line: python counts.py coeff_file m,k,n

    Rows A, B, C come from base_counts on coefficient sets 0-2 (set 2 is
    transposed first).  Rows AX, BX, CX come from elim_counts on sets 3-5;
    a set that is absent is reported as all zeros.

    Raises Exception carrying the usage string when an argument is missing
    or the dimensions are not comma-separated integers.
    """
    try:
        coeff_file = sys.argv[1]
        # dims is not used below; it is parsed so malformed input is rejected.
        dims = tuple([int(d) for d in sys.argv[2].split(",")])
    except (IndexError, ValueError):
        # Only a missing argument or a non-integer dimension is a usage
        # error; anything else (e.g. KeyboardInterrupt) must propagate.
        raise Exception("USAGE: python counts.py coeff_file m,k,n")

    coeffs = convert.read_coeffs(coeff_file)
    coeffs[2] = subexpr_elim.transpose(coeffs[2])

    counts = [base_counts(coeffs[i]) for i in range(3)]
    # Check each optional set on its own so a file with only some of sets
    # 3-5 does not raise IndexError.
    counts_X = [
        elim_counts(coeffs[i]) if i < len(coeffs) else (0, 0, 0)
        for i in range(3, 6)
    ]
    xtotal = tuple([sum(cnts) for cnts in zip(*(counts_X))])
    total = tuple([sum(cnts) for cnts in zip(*(counts + counts_X))])

    # Parenthesised single-argument prints behave the same on Python 2 and 3.
    print("      +   r   w")
    print("A    %3d %3d %3d" % counts[0])
    print("B    %3d %3d %3d" % counts[1])
    print("C    %3d %3d %3d" % counts[2])
    print("AX   %3d %3d %3d" % counts_X[0])
    print("BX   %3d %3d %3d" % counts_X[1])
    print("CX   %3d %3d %3d" % counts_X[2])
    print("xtot %3d %3d %3d" % xtotal)
    print("tot  %3d %3d %3d" % total)
コード例 #10
0
def main():
    ''' Print emax, the product of the three max norms, and emax times it.

    Command line: python stability_vector.py coeff_file m,k,n

    Raises Exception carrying the usage string when an argument is missing
    or the dimensions are not comma-separated integers.
    '''
    try:
        coeff_file = sys.argv[1]
        # dims is not used below; it is parsed so malformed input is rejected.
        dims = tuple([int(d) for d in sys.argv[2].split(',')])
    except (IndexError, ValueError):
        # Only a missing argument or a non-integer dimension is a usage
        # error; anything else (e.g. KeyboardInterrupt) must propagate.
        raise Exception('USAGE: python stability_vector.py coeff_file m,k,n')

    coeffs = read_coeffs(coeff_file)

    # Using the notation from our paper
    a_vec = np.array([
        sum(np.abs([zero_one(x) for x in row])) for row in transpose(coeffs[0])
    ])
    b_vec = np.array([
        sum(np.abs([zero_one(x) for x in row])) for row in transpose(coeffs[1])
    ])

    e_vec = [
        np.dot(np.abs([float(x) for x in row]), a_vec * b_vec)
        for row in coeffs[2]
    ]

    mn = max_norm(coeffs[0]) * max_norm(coeffs[1]) * max_norm(coeffs[2])
    emax = int(np.max(e_vec))

    # A single pre-joined string prints identically on Python 2 and 3.
    print(' '.join(str(val) for val in (emax, mn, emax * mn)))
コード例 #11
0
ファイル: counts.py プロジェクト: liyancas/fast-matmul
def base_counts(coeff_set):
    """ Count the number of additions, reads, and writes for this coefficient set (A, B, or C).
    For A and B, these are the adds used to form the 'S' and 'T' matrices.
    If an S or T matrix is just a length-1 addition chain, then no additions are counted.

    Returns the tuple (num_adds, num_reads, num_writes), summed over the
    columns of coeff_set.
    """
    # Count the nonzeros of each column once and reuse the result; this also
    # stays correct if transpose() hands back a one-shot iterable.
    nnz_per_col = [gen.num_nonzero(col)
                   for col in subexpr_elim.transpose(coeff_set)]

    # A column with k nonzeros needs k - 1 additions.  Clamp at zero so an
    # all-zero column cannot subtract one from the total.
    num_adds = sum([max(nnz - 1, 0) for nnz in nnz_per_col])
    num_reads = sum(nnz_per_col)
    # A write is counted only when a sum is actually formed.
    num_writes = sum([1 for nnz in nnz_per_col if nnz > 1])
    return (num_adds, num_reads, num_writes)
コード例 #12
0
def main():
    ''' Print R, the three dimensions, and the largest entry of e_vec.

    Command line: python relative_quantities.py coeff_file m,k,n
    Any further arguments are accepted and ignored.

    Raises Exception carrying the usage string when an argument is missing
    or the dimensions are not comma-separated integers.
    '''
    try:
        coeff_file = sys.argv[1]
        dims = tuple([int(d) for d in sys.argv[2].split(',')])
    except (IndexError, ValueError):
        # Only a missing argument or a non-integer dimension is a usage
        # error; anything else (e.g. KeyboardInterrupt) must propagate.
        raise Exception('USAGE: python relative_quantities.py coeff_file m,k,n')

    coeffs = read_coeffs(coeff_file)

    # Using the notation from our paper
    a_vec = np.array(
        [sum(np.abs([float(x) for x in row])) for row in transpose(coeffs[0])])
    b_vec = np.array(
        [sum(np.abs([float(x) for x in row])) for row in transpose(coeffs[1])])

    e_vec = [
        np.dot(np.abs([float(x) for x in row]), a_vec * b_vec)
        for row in coeffs[2]
    ]

    R = len(coeffs[0][0])

    # A single pre-joined string prints identically on Python 2 and 3.
    fields = (R, dims[0], dims[1], dims[2], int(max(e_vec)))
    print(' '.join(str(val) for val in fields))
コード例 #13
0
ファイル: gen.py プロジェクト: timminn/fast-matmul
def streaming_additions(header,
                        coeff_set,
                        mat_name,
                        tmp_name,
                        mat_dims,
                        is_output,
                        num_multiplies,
                        sub_coeffs=None):
    '''
    Write the streaming additions for the matrices to be used in the multiplications.

    header is the file where the code is being generated
    coeff_set is the set of coefficients (corresponding to U, V, or W)
    mat_name is the name of the matrix we are working on (A, B, or C, that matches
             U, V, or W where coeff_set comes from)
    tmp_name is the base name to use for temporary variables (e.g., 'S' or 'T')
    mat_dims is the dimension of the base case matrix
    is_output indicates whether or not we are working on the matrix C
    num_multiplies is the number of multiplications, i.e., the rank of the
                   algorithm
    sub_coeffs are the substitution coefficients used for common
               subexpression elimination (CSE).  This is an optional
               argument, and should only be used if we are using CSE.
    '''
    def tmp_mat_name(i):
        # Temporaries are numbered from 1.
        return tmp_name + str(i + 1)

    def subblock_name(i):
        return mat_name + get_suffix(i, mat_dims[0], mat_dims[1])

    # Find indices of additional temporaries needed from subexpression elimination
    if is_output:
        additional_tmps = []
    else:
        additional_tmps = [i for i in range(len(coeff_set))
                           if need_streaming_cse_tmp(i, coeff_set, mat_dims)]

    # All of the strides for the matrix subblocks
    for i in range(len(coeff_set)):
        subblock = subblock_name(i)
        if i in additional_tmps:
            write_line(header, 1, instantiate(subblock, mat_name))
        if i < mat_dims[0] * mat_dims[1] or i in additional_tmps:
            write_line(header, 1, stride_call(subblock))
            write_line(header, 1, data_call(subblock))

    # Data for the temporary matrices
    if is_output:
        # range() rather than xrange() so this also runs on Python 3.
        for i in range(num_multiplies):
            tmp_mat = tmp_mat_name(i)
            write_line(header, 1, stride_call(tmp_mat))
            write_line(header, 1, data_call(tmp_mat))
    else:
        for i, col in enumerate(subexpr_elim.transpose(coeff_set)):
            if need_tmp_mat(col):
                tmp_mat = tmp_mat_name(i)
                write_line(header, 1, instantiate(tmp_mat, mat_name))
                write_line(header, 1, stride_call(tmp_mat))
                write_line(header, 1, data_call(tmp_mat))

    # From here on the inputs (A, B) are walked in transposed form; the
    # output coefficients are used as given.
    if not is_output:
        coeff_set = subexpr_elim.transpose(coeff_set)

    def inner_loop(handle_beta=False):
        write_line(header, 0, '#ifdef _PARALLEL_')
        write_line(header, 0, '# pragma omp parallel for')
        write_line(header, 0, '#endif')
        write_line(header, 1,
                   'for (int j = 0; j < %s11.n(); ++j) {' % mat_name)
        write_line(header, 2,
                   'for (int i = 0; i < %s11.m(); ++i) {' % mat_name)

        # Deal with substitutions from CSE
        # Identity test: '!= None' would be evaluated elementwise (and then
        # fail as a condition) if sub_coeffs were ever an array-like object.
        if sub_coeffs is not None:
            for i, col in enumerate(sub_coeffs):
                if i + mat_dims[0] * mat_dims[1] in additional_tmps:
                    # This CSE result has its own temporary matrix: store
                    # into it.
                    add = data_access('%s_X%d' % (mat_name, i + 1)) + ' ='
                else:
                    # Otherwise the CSE result is a Scalar local in the
                    # generated loop body.
                    if is_output:
                        curr_name = tmp_name
                    else:
                        curr_name = mat_name
                    add = 'Scalar %s_X%d = ' % (curr_name, i + 1)

                data = [(k, coeff) for k, coeff in enumerate(col)
                        if is_nonzero(coeff)]
                for j, (ind, coeff) in enumerate(data):
                    if is_output:
                        data_name = tmp_mat_name(ind)
                    else:
                        data_name = subblock_name(ind)
                    add += arith_expression(coeff, data_access(data_name), j)

                add += ';'
                write_line(header, 3, add)

        for i, col in enumerate(coeff_set):
            if need_tmp_mat(col):
                if is_output:
                    add = data_access(subblock_name(i))
                else:
                    add = data_access(tmp_mat_name(i))
                add += ' = '

                data = [(k, coeff) for k, coeff in enumerate(col)
                        if is_nonzero(coeff)]
                for j, (ind, coeff) in enumerate(data):
                    if is_output:
                        if ind >= num_multiplies:
                            # NOTE(review): 'M_X' is hard-coded here while
                            # the Scalar declaration above uses tmp_name;
                            # they agree only when tmp_name is 'M' - confirm
                            # against the callers.
                            data_name = 'M_X' + str(ind + 1 - num_multiplies)
                        else:
                            data_name = tmp_mat_name(ind)
                    else:
                        data_name = subblock_name(ind)

                    # Deal with subexpression elimination: CSE results that
                    # are Scalar locals are referenced by bare name, and
                    # everything else goes through data_access().
                    is_cse_scalar = (
                        (ind >= mat_dims[0] * mat_dims[1] and not is_output
                         and ind not in additional_tmps)
                        or (ind >= num_multiplies and is_output))
                    if is_cse_scalar:
                        add += arith_expression(coeff, data_name, j)
                    else:
                        add += arith_expression(coeff, data_access(data_name),
                                                j)

                if is_output and handle_beta:
                    add += ' + beta * ' + data_access(subblock_name(i))
                add += ';'
                write_line(header, 3, add)

        write_line(header, 2, '}')  # end i loop
        write_line(header, 1, '}')  # end j loop

    if not is_output:
        inner_loop()
    else:
        # For the output, emit the loop twice so the beta == 0 case omits
        # the '+ beta * C' term.
        write_line(header, 1, 'if (beta == Scalar(0.0)) {')
        inner_loop()
        write_line(header, 1, '} else {')
        inner_loop(True)
        write_line(header, 1, '}')