def create_add_functions(header, coeffs, pairwise):
    ''' Emit every custom add function into the generated header.

    header is the file to which we are writing
    coeffs is the set of all coefficients: entries 0, 1, 2 are always
        used, and entries 3, 4, 5 are used only when they are present
    pairwise selects write_pairwise_add_func instead of write_add_func
    '''
    def emit_group(coeff_sets, prefix, bfs_par_avail):
        # Functions are numbered from 1; empty coefficient sets are skipped
        # but still consume a number.
        for number, coeff_set in enumerate(coeff_sets, 1):
            if len(coeff_set) == 0:
                continue
            if pairwise:
                write_pairwise_add_func(header, coeff_set, number, prefix)
            else:
                write_add_func(header, coeff_set, number, prefix,
                               bfs_par_avail)
            write_break(header)

    # S matrices formed from A subblocks
    emit_group(subexpr_elim.transpose(coeffs[0]), 'S', False)
    # T matrices formed from B subblocks
    emit_group(subexpr_elim.transpose(coeffs[1]), 'T', False)
    # Output C formed from multiplications
    emit_group(coeffs[2], 'M', True)

    # If there was CSE, create more add functions for the temporary
    # matrices.  zip stops at the shorter sequence, so only the entries
    # that actually exist in coeffs are processed.
    for prefix, coeff_sets in zip(('SX', 'TX', 'MX'), coeffs[3:6]):
        emit_group(coeff_sets, prefix, True)
def compute_phi(coeffs):
    r'''
    Compute phi (in Bini et al. notation), which is defined as

        \phi = max { z | u_i^{(r)}v_j^{(r)}w_k^{(r)} = O(\epsilon^{-z}) },

    where u_i^{(r)}, v_j^{(r)}, and w_k^{(r)} are entries of the rth column
    of U, V, and W.

    coeffs holds U, V, and W in entries 0, 1, and 2.  The return value is
    the negation of the smallest exponent found over all triple products.
    '''
    U = transpose(coeffs[0])
    V = transpose(coeffs[1])
    W = transpose(coeffs[2])

    def smallest_exponent(u, v, w):
        def min_exp(num):
            # num.val is a dictionary of (exponent, coefficient).  Build a
            # real list instead of calling filter(): on Python 3 filter()
            # returns an iterator, which has no len().
            viable = [exp for exp, coeff in num.val.items() if coeff != 0]
            if not viable:
                return 0
            return min(viable)

        all_nums = [Number(x1) * Number(x2) * Number(x3)
                    for x1 in u for x2 in v for x3 in w]
        return min([min_exp(num) for num in all_nums])

    return -min([smallest_exponent(u, v, w) for u, v, w in zip(U, V, W)])
def main():
    '''
    Print summary quantities for the algorithm in a coefficient file.

    Command line: python relative_quantities.py coeff_file m,k,n

    One line is printed: len(coeffs[0][0]), the three dimensions, and the
    integer part of the largest entry of the e vector.  Any argument after
    m,k,n is ignored.

    Raises Exception with the usage string when an argument is missing,
    when a dimension is not an integer, or when fewer than three
    dimensions are given.
    '''
    try:
        coeff_file = sys.argv[1]
        dims = tuple([int(d) for d in sys.argv[2].split(',')])
        if len(dims) < 3:
            # dims[0..2] are all needed below; fail with the usage message
            # now rather than with an IndexError after the computation.
            raise ValueError('expected three dimensions m,k,n')
    except (IndexError, ValueError):
        # IndexError: too few arguments.  ValueError: malformed dimensions.
        raise Exception(
            'USAGE: python relative_quantities.py coeff_file m,k,n')

    coeffs = read_coeffs(coeff_file)

    # Using the notation from our paper
    # a_vec / b_vec: sum of absolute values of each row of the transposed
    # first / second coefficient matrix.
    a_vec = np.array([sum(np.abs([float(x) for x in row]))
                      for row in transpose(coeffs[0])])
    b_vec = np.array([sum(np.abs([float(x) for x in row]))
                      for row in transpose(coeffs[1])])
    # e_vec: for each row of the third matrix, its absolute values dotted
    # with the elementwise product of a_vec and b_vec.
    e_vec = [np.dot(np.abs([float(x) for x in row]), a_vec * b_vec)
             for row in coeffs[2]]

    R = len(coeffs[0][0])
    # Single-argument print behaves the same on Python 2 and Python 3.
    fields = (R, dims[0], dims[1], dims[2], int(max(e_vec)))
    print(' '.join(str(val) for val in fields))
def main():
    '''
    Print the largest e-vector entry, the product of the three max norms,
    and the product of those two numbers.

    Command line: python stability_vector.py coeff_file m,k,n

    m,k,n is validated as a comma-separated list of integers but is not
    otherwise used by this script.

    Raises Exception with the usage string when an argument is missing or
    a dimension is not an integer.
    '''
    try:
        coeff_file = sys.argv[1]
        # Parsed only to validate the argument; the values are not needed.
        [int(d) for d in sys.argv[2].split(',')]
    except (IndexError, ValueError):
        # IndexError: too few arguments.  ValueError: malformed dimensions.
        raise Exception('USAGE: python stability_vector.py coeff_file m,k,n')

    coeffs = read_coeffs(coeff_file)

    # Using the notation from our paper
    # a_vec / b_vec: per-row sums of zero_one() over the transposed first /
    # second coefficient matrix.
    a_vec = np.array([sum(np.abs([zero_one(x) for x in row]))
                      for row in transpose(coeffs[0])])
    b_vec = np.array([sum(np.abs([zero_one(x) for x in row]))
                      for row in transpose(coeffs[1])])
    # e_vec: for each row of the third matrix, its absolute values dotted
    # with the elementwise product of a_vec and b_vec.
    e_vec = [np.dot(np.abs([float(x) for x in row]), a_vec * b_vec)
             for row in coeffs[2]]

    mn = max_norm(coeffs[0]) * max_norm(coeffs[1]) * max_norm(coeffs[2])
    emax = int(np.max(e_vec))
    # Single-argument print behaves the same on Python 2 and Python 3.
    print(' '.join(str(val) for val in (emax, mn, emax * mn)))
def main():
    '''
    Print stability and cost quantities for the algorithm in a
    coefficient file.

    Command line: python stability_vector.py coeff_file m,k,n [full]

    The summary line contains m*k*n, len(coeffs[0][0]), the total number
    of nonzero coefficients, the largest q-vector entry, and the largest
    e-vector entry.  When a third argument is supplied, the e and q
    vectors are also printed as m rows of n values.

    Raises Exception with the usage string when an argument is missing,
    when a dimension is not an integer, or when fewer than three
    dimensions are given.
    '''
    try:
        coeff_file = sys.argv[1]
        dims = tuple([int(d) for d in sys.argv[2].split(',')])
        if len(dims) < 3:
            # dims[0..2] are all needed below; fail with the usage message
            # now rather than with an IndexError after the computation.
            raise ValueError('expected three dimensions m,k,n')
    except (IndexError, ValueError):
        # IndexError: too few arguments.  ValueError: malformed dimensions.
        raise Exception('USAGE: python stability_vector.py coeff_file m,k,n')

    # NOTE(review): the flag is kept as the raw argument string, so any
    # non-empty value (including '0') enables the full printout -- confirm
    # that this is intended.
    full_stab_mat = 0
    if len(sys.argv) > 3:
        full_stab_mat = sys.argv[3]

    coeffs = read_coeffs(coeff_file)

    # Using the notation from our paper
    a_vec = np.array([sum(np.abs([float(x) for x in row]))
                      for row in transpose(coeffs[0])])
    b_vec = np.array([sum(np.abs([float(x) for x in row]))
                      for row in transpose(coeffs[1])])
    e_vec = [np.dot(np.abs([float(x) for x in row]), a_vec * b_vec)
             for row in coeffs[2]]

    # Nonzero counts per row of the transposed first / second matrix and
    # per row of the third matrix.
    alpha_vec = np.array([num_nonzero(row) for row in transpose(coeffs[0])])
    beta_vec = np.array([num_nonzero(row) for row in transpose(coeffs[1])])
    ab_vec = alpha_vec + beta_vec
    gamma_vec = [num_nonzero(row) for row in coeffs[2]]

    # 0/1 indicator of the nonzero pattern of the third matrix.
    W_ind = np.array([[float(val) for val in row] for row in coeffs[2]])
    W_ind[np.nonzero(W_ind)] = 1
    q_vec = [gamma_vec[k] + np.max(ab_vec * W_ind[k, :])
             for k in range(W_ind.shape[0])]

    # Number of additions
    nnz = sum(ab_vec) + sum(gamma_vec)

    # Summary line: mkn rank nnz qmax emax
    rank = len(coeffs[0][0])
    mkn = dims[0] * dims[1] * dims[2]
    summary = (mkn, rank, nnz, int(np.max(q_vec)), int(np.max(e_vec)))
    # Single-argument print behaves the same on Python 2 and Python 3.
    print(' '.join(str(val) for val in summary))

    def print_grid(title, vec):
        # Print vec as dims[0] tab-indented rows of dims[2] values each.
        print(title)
        out_format = '\t' + ' '.join(['%4d'] * dims[2])
        for i in range(dims[0]):
            vals = vec[(i * dims[2]):(i * dims[2] + dims[2])]
            print(out_format % tuple(vals))

    if full_stab_mat:
        # Print in the same style as D'Alberto presents in his 2014 paper.
        print_grid('e vector:', e_vec)
        print_grid('q vector:', q_vec)
def main():
    '''
    Print stability and cost quantities for the algorithm in a
    coefficient file.

    Command line: python stability_vector.py coeff_file m,k,n [full]

    The summary line contains m*k*n, len(coeffs[0][0]), the total number
    of nonzero coefficients, the largest q-vector entry, and the largest
    e-vector entry.  When a third argument is supplied, the e and q
    vectors are also printed as m rows of n values.

    Raises Exception with the usage string when an argument is missing,
    when a dimension is not an integer, or when fewer than three
    dimensions are given.
    '''
    try:
        coeff_file = sys.argv[1]
        dims = tuple([int(d) for d in sys.argv[2].split(',')])
        if len(dims) < 3:
            # All three dimensions are indexed below, so reject a short
            # list here with the usage message instead of a late IndexError.
            raise ValueError('expected three dimensions m,k,n')
    except (IndexError, ValueError):
        # IndexError: too few arguments.  ValueError: malformed dimensions.
        raise Exception('USAGE: python stability_vector.py coeff_file m,k,n')

    # NOTE(review): the flag is the raw argument string, so '0' is truthy
    # and still turns the full printout on -- confirm this is intended.
    full_stab_mat = 0
    if len(sys.argv) > 3:
        full_stab_mat = sys.argv[3]

    coeffs = read_coeffs(coeff_file)

    # Using the notation from our paper
    a_vec = np.array([sum(np.abs([float(x) for x in row]))
                      for row in transpose(coeffs[0])])
    b_vec = np.array([sum(np.abs([float(x) for x in row]))
                      for row in transpose(coeffs[1])])
    e_vec = [np.dot(np.abs([float(x) for x in row]), a_vec * b_vec)
             for row in coeffs[2]]

    # Nonzero counts per row of the transposed first / second matrix and
    # per row of the third matrix.
    alpha_vec = np.array([num_nonzero(row) for row in transpose(coeffs[0])])
    beta_vec = np.array([num_nonzero(row) for row in transpose(coeffs[1])])
    ab_vec = alpha_vec + beta_vec
    gamma_vec = [num_nonzero(row) for row in coeffs[2]]

    # 0/1 indicator of the nonzero pattern of the third matrix.
    W_ind = np.array([[float(val) for val in row] for row in coeffs[2]])
    W_ind[np.nonzero(W_ind)] = 1
    q_vec = [gamma_vec[k] + np.max(ab_vec * W_ind[k, :])
             for k in range(W_ind.shape[0])]

    # Number of additions
    nnz = sum(ab_vec) + sum(gamma_vec)

    # Summary line: mkn rank nnz qmax emax
    rank = len(coeffs[0][0])
    mkn = dims[0] * dims[1] * dims[2]
    summary = (mkn, rank, nnz, int(np.max(q_vec)), int(np.max(e_vec)))
    # Single-argument print behaves the same on Python 2 and Python 3.
    print(' '.join(str(val) for val in summary))

    def print_grid(title, vec):
        # Print vec as dims[0] tab-indented rows of dims[2] values each.
        print(title)
        out_format = '\t' + ' '.join(['%4d'] * dims[2])
        for i in range(dims[0]):
            vals = vec[(i * dims[2]):(i * dims[2] + dims[2])]
            print(out_format % tuple(vals))

    if full_stab_mat:
        # Print in the same style as D'Alberto presents in his 2014 paper.
        print_grid('e vector:', e_vec)
        print_grid('q vector:', q_vec)
def need_streaming_cse_tmp(ind, coeffs, mat_dims):
    ''' Decide whether index ind needs its own temporary matrix when the
    additions are streamed.

    A temporary is needed when an expression that was eliminated through
    common subexpression elimination is, by itself, the whole addition
    string of some multiplication.  For example,

          M1 = (A11 + A12 + A13) * (B11 + B12)
          M2 = (A11 + A12) * (B21 + B22)

    Eliminating A_X = A11 + A12 gives

          M1 = (A_X + A13) * (B11 + B12)
          M2 = (A_X) * (B21 + B22)

    and A_X is used as a length-1 addition string for M2.

    ind is the index (zero-indexed linearly) of the matrix in the
        coefficient file
    coeffs is the set of coefficients for the U or V matrix
    mat_dims is a 2-tuple of the matrix dimensions (A or B corresponding
        to U or V)
    '''
    # Indices below the number of subblocks are never reported.
    num_subblocks = mat_dims[0] * mat_dims[1]
    if ind < num_subblocks:
        return False

    for column in subexpr_elim.transpose(coeffs):
        nonzero_rows = [row for row, entry in enumerate(column)
                        if is_nonzero(entry)]
        # A column whose only nonzero entry sits at ind is a length-1
        # addition string made of exactly this expression.
        if nonzero_rows == [ind]:
            return True
    return False
def main():
    """
    Print a table of addition, read, and write counts for an algorithm.

    Command line: python counts.py coeff_file m,k,n

    m,k,n is validated as a comma-separated list of integers but is not
    otherwise used.  Rows A, B, and C come from base_counts on coefficient
    sets 0-2 (set 2 is transposed first).  Rows AX, BX, and CX come from
    elim_counts on sets 3-5; any of those sets that is absent from the
    file is reported as (0, 0, 0).

    Raises Exception with the usage string when an argument is missing or
    a dimension is not an integer.
    """
    try:
        coeff_file = sys.argv[1]
        # Parsed only to validate the argument; the values are not needed.
        [int(d) for d in sys.argv[2].split(",")]
    except (IndexError, ValueError):
        # IndexError: too few arguments.  ValueError: malformed dimensions.
        raise Exception("USAGE: python counts.py coeff_file m,k,n")

    coeffs = convert.read_coeffs(coeff_file)
    coeffs[2] = subexpr_elim.transpose(coeffs[2])

    counts = [base_counts(coeffs[i]) for i in range(3)]
    # Index each CSE set only if it exists, so a file with four or five
    # coefficient sets no longer raises IndexError.
    counts_X = [elim_counts(coeffs[i]) if i < len(coeffs) else (0, 0, 0)
                for i in range(3, 6)]
    xtotal = tuple([sum(cnts) for cnts in zip(*counts_X)])
    total = tuple([sum(cnts) for cnts in zip(*(counts + counts_X))])

    # Single-argument print behaves the same on Python 2 and Python 3.
    print(" + r w")
    print("A %3d %3d %3d" % counts[0])
    print("B %3d %3d %3d" % counts[1])
    print("C %3d %3d %3d" % counts[2])
    print("AX %3d %3d %3d" % counts_X[0])
    print("BX %3d %3d %3d" % counts_X[1])
    print("CX %3d %3d %3d" % counts_X[2])
    print("xtot %3d %3d %3d" % xtotal)
    print("tot %3d %3d %3d" % total)
def main():
    '''
    Print the largest e-vector entry, the product of the three max norms,
    and the product of those two numbers.

    Command line: python stability_vector.py coeff_file m,k,n

    m,k,n is validated as a comma-separated list of integers but is not
    otherwise used by this script.

    Raises Exception with the usage string when an argument is missing or
    a dimension is not an integer.
    '''
    try:
        coeff_file = sys.argv[1]
        # Parsed only to validate the argument; the values are not needed.
        [int(d) for d in sys.argv[2].split(',')]
    except (IndexError, ValueError):
        # IndexError: too few arguments.  ValueError: malformed dimensions.
        raise Exception('USAGE: python stability_vector.py coeff_file m,k,n')

    coeffs = read_coeffs(coeff_file)

    # Using the notation from our paper
    # a_vec / b_vec: per-row sums of zero_one() over the transposed first /
    # second coefficient matrix.
    a_vec = np.array([sum(np.abs([zero_one(x) for x in row]))
                      for row in transpose(coeffs[0])])
    b_vec = np.array([sum(np.abs([zero_one(x) for x in row]))
                      for row in transpose(coeffs[1])])
    # e_vec: for each row of the third matrix, its absolute values dotted
    # with the elementwise product of a_vec and b_vec.
    e_vec = [np.dot(np.abs([float(x) for x in row]), a_vec * b_vec)
             for row in coeffs[2]]

    mn = max_norm(coeffs[0]) * max_norm(coeffs[1]) * max_norm(coeffs[2])
    emax = int(np.max(e_vec))
    # Single-argument print behaves the same on Python 2 and Python 3.
    print(' '.join(str(val) for val in (emax, mn, emax * mn)))
def base_counts(coeff_set):
    """
    Return (num_adds, num_reads, num_writes) for one coefficient set
    (A, B, or C).

    Each column of the transposed coefficient set is one addition chain.
    A chain with z nonzero entries contributes z - 1 additions and z
    reads, and contributes one write only when z > 1.  For A and B these
    are the adds used to form the 'S' and 'T' matrices, so a length-1
    chain costs no addition and no write.
    """
    # Nonzero count of every chain, computed once and reused below.
    chain_lengths = [gen.num_nonzero(col)
                     for col in subexpr_elim.transpose(coeff_set)]

    num_reads = sum(chain_lengths)
    num_adds = sum(length - 1 for length in chain_lengths)
    num_writes = sum(1 for length in chain_lengths if length > 1)
    return (num_adds, num_reads, num_writes)
def main():
    '''
    Print summary quantities for the algorithm in a coefficient file.

    Command line: python relative_quantities.py coeff_file m,k,n

    One line is printed: len(coeffs[0][0]), the three dimensions, and the
    integer part of the largest entry of the e vector.  Any argument after
    m,k,n is ignored.

    Raises Exception with the usage string when an argument is missing,
    when a dimension is not an integer, or when fewer than three
    dimensions are given.
    '''
    try:
        coeff_file = sys.argv[1]
        dims = tuple([int(d) for d in sys.argv[2].split(',')])
        if len(dims) < 3:
            # All three dimensions are printed below, so reject a short
            # list here with the usage message instead of a late IndexError.
            raise ValueError('expected three dimensions m,k,n')
    except (IndexError, ValueError):
        # IndexError: too few arguments.  ValueError: malformed dimensions.
        raise Exception('USAGE: python relative_quantities.py coeff_file m,k,n')

    coeffs = read_coeffs(coeff_file)

    # Using the notation from our paper
    # a_vec / b_vec: sum of absolute values of each row of the transposed
    # first / second coefficient matrix.
    a_vec = np.array([sum(np.abs([float(x) for x in row]))
                      for row in transpose(coeffs[0])])
    b_vec = np.array([sum(np.abs([float(x) for x in row]))
                      for row in transpose(coeffs[1])])
    # e_vec: for each row of the third matrix, its absolute values dotted
    # with the elementwise product of a_vec and b_vec.
    e_vec = [np.dot(np.abs([float(x) for x in row]), a_vec * b_vec)
             for row in coeffs[2]]

    R = len(coeffs[0][0])
    # Single-argument print behaves the same on Python 2 and Python 3.
    fields = (R, dims[0], dims[1], dims[2], int(max(e_vec)))
    print(' '.join(str(val) for val in fields))
def streaming_additions(header, coeff_set, mat_name, tmp_name, mat_dims,
                        is_output, num_multiplies, sub_coeffs=None):
    ''' Write the streaming additions for the matrices to be used in the
    multiplications.

    header is the file where the code is being generated
    coeff_set is the set of coefficients (corresponding to U, V, or W)
    mat_name is the name of the matrix we are working on (A, B, or C, that
        matches U, V, or W where coeff_set comes from)
    tmp_name is the base name to use for temporary variables (e.g., 'S' or
        'T')
    mat_dims is the dimension of the base case matrix
    is_output indicates whether or not we are working on the matrix C
    num_multiplies is the number of multiplications, i.e., the rank of the
        algorithm
    sub_coeffs are the substitution coefficients used for common
        subexpression elimination (CSE).  This is an optional argument, and
        should only be used if we are using CSE.

    Nothing is returned; all output goes to header through write_line.
    '''
    num_subblocks = mat_dims[0] * mat_dims[1]

    def tmp_mat_name(i):
        return tmp_name + str(i + 1)

    def subblock_name(i):
        return mat_name + get_suffix(i, mat_dims[0], mat_dims[1])

    # Find indices of additional temporaries needed from subexpression
    # elimination
    if is_output:
        additional_tmps = []
    else:
        additional_tmps = [i for i in range(len(coeff_set))
                           if need_streaming_cse_tmp(i, coeff_set, mat_dims)]

    # All of the strides for the matrix subblocks
    for i in range(len(coeff_set)):
        subblock = subblock_name(i)
        if i in additional_tmps:
            write_line(header, 1, instantiate(subblock, mat_name))
        if i < num_subblocks or i in additional_tmps:
            write_line(header, 1, stride_call(subblock))
            write_line(header, 1, data_call(subblock))

    # Data for the temporary matrices
    if is_output:
        # range() rather than the Python 2-only xrange(); the loop is short.
        for i in range(num_multiplies):
            tmp_mat = tmp_mat_name(i)
            write_line(header, 1, stride_call(tmp_mat))
            write_line(header, 1, data_call(tmp_mat))
    else:
        for i, col in enumerate(subexpr_elim.transpose(coeff_set)):
            if need_tmp_mat(col):
                tmp_mat = tmp_mat_name(i)
                write_line(header, 1, instantiate(tmp_mat, mat_name))
                write_line(header, 1, stride_call(tmp_mat))
                write_line(header, 1, data_call(tmp_mat))

    # From here on the input matrices are walked in transposed form; the
    # output matrix keeps its original orientation.
    if not is_output:
        coeff_set = subexpr_elim.transpose(coeff_set)

    def inner_loop(handle_beta=False):
        # Emit one doubly nested element loop; with handle_beta the
        # existing output value scaled by beta is added to each result.
        write_line(header, 0, '#ifdef _PARALLEL_')
        write_line(header, 0, '# pragma omp parallel for')
        write_line(header, 0, '#endif')
        write_line(header, 1,
                   'for (int j = 0; j < %s11.n(); ++j) {' % mat_name)
        write_line(header, 2,
                   'for (int i = 0; i < %s11.m(); ++i) {' % mat_name)

        # Deal with substitutions from CSE
        if sub_coeffs is not None:
            for i, col in enumerate(sub_coeffs):
                if i + num_subblocks in additional_tmps:
                    # This substitution has its own temporary matrix, so
                    # store into it instead of declaring a scalar.
                    add = data_access('%s_X%d' % (mat_name, i + 1)) + ' ='
                else:
                    if is_output:
                        curr_name = tmp_name
                    else:
                        curr_name = mat_name
                    add = 'Scalar %s_X%d = ' % (curr_name, i + 1)
                data = [(k, coeff) for k, coeff in enumerate(col)
                        if is_nonzero(coeff)]
                for j, (ind, coeff) in enumerate(data):
                    if is_output:
                        data_name = tmp_mat_name(ind)
                    else:
                        data_name = subblock_name(ind)
                    add += arith_expression(coeff, data_access(data_name), j)
                add += ';'
                write_line(header, 3, add)

        for i, col in enumerate(coeff_set):
            if need_tmp_mat(col):
                if is_output:
                    add = data_access(subblock_name(i))
                else:
                    add = data_access(tmp_mat_name(i))
                add += ' = '
                data = [(k, coeff) for k, coeff in enumerate(col)
                        if is_nonzero(coeff)]
                for j, (ind, coeff) in enumerate(data):
                    if is_output:
                        if ind >= num_multiplies:
                            data_name = 'M_X' + str(ind + 1 - num_multiplies)
                        else:
                            data_name = tmp_mat_name(ind)
                    else:
                        data_name = subblock_name(ind)
                    # Deal with subexpression elimination: a CSE scalar is
                    # used by name, everything else through data_access.
                    is_input_cse_scalar = (ind >= num_subblocks and
                                           not is_output and
                                           ind not in additional_tmps)
                    is_output_cse_scalar = (ind >= num_multiplies and
                                            is_output)
                    if is_input_cse_scalar or is_output_cse_scalar:
                        add += arith_expression(coeff, data_name, j)
                    else:
                        add += arith_expression(coeff,
                                                data_access(data_name), j)

                if is_output and handle_beta:
                    add += ' + beta * ' + data_access(subblock_name(i))
                add += ';'
                write_line(header, 3, add)

        write_line(header, 2, '}')  # end i loop
        write_line(header, 1, '}')  # end j loop

    if not is_output:
        inner_loop()
    else:
        # The beta == 0 branch leaves the beta term out of the emitted
        # expressions.
        write_line(header, 1, 'if (beta == Scalar(0.0)) {')
        inner_loop()
        write_line(header, 1, '} else {')
        inner_loop(True)
        write_line(header, 1, '}')