def __init__(self, inf_bound, sup_bound):
    """Store the bounds of an interval and the exponent range they span.

    Args:
        inf_bound: lower bound of the interval.
        sup_bound: upper bound of the interval.

    Attributes set:
        inf_bound, sup_bound: the bounds, stored unchanged.
        zero_in_interval: whether 0 belongs to
            sollya.Interval(inf_bound, sup_bound).
        min_exp: smallest ceil(log2(|bound|)) over the two bounds, or None
            when the interval contains zero.
        max_exp: largest ceil(log2(|bound|)) over the two bounds.
    """
    self.inf_bound = inf_bound
    self.sup_bound = sup_bound
    self.zero_in_interval = 0 in sollya.Interval(inf_bound, sup_bound)

    # ceil(log2(|b|)) of each bound, shared by min_exp and max_exp
    bound_exponents = [
        sollya.ceil(sollya.log2(abs(bound)))
        for bound in (inf_bound, sup_bound)
    ]

    if self.zero_in_interval:
        # no meaningful minimal exponent when the interval straddles zero
        self.min_exp = None
    else:
        self.min_exp = min(bound_exponents)
    self.max_exp = max(bound_exponents)
def solve_format_Constant(optree):
    """ Legalize Constant node

        Returns the node's own precision when its value is a special value
        or when a precision has already been set; otherwise returns a
        fixed-point format able to hold the constant's value.
    """
    assert isinstance(optree, Constant)
    value = optree.get_value()

    if FP_SpecialValue.is_special_value(value):
        return optree.get_precision()

    forced_precision = optree.get_precision()
    if forced_precision is not None:
        # precision was already set (manually forced): keep it
        return forced_precision

    # fixed-point format solving
    FRAC_THRESHOLD = 100  # maximum number of frac bit to be tested
    # TODO: fix
    # smallest shift making value * 2**shift an integer, -1 if none is found
    frac_size = next(
        (shift for shift in range(FRAC_THRESHOLD)
         if int(value * 2**shift) == value * 2**shift),
        -1
    )
    if frac_size < 0:
        Log.report(Log.Error, "value {} is not an integer, from node:\n{}", value, optree)

    magnitude = abs(value)
    signed = value < 0
    int_size = max(int(sollya.ceil(sollya.log2(magnitude + 1))), 0)
    if signed:
        # one extra bit for the sign
        int_size += 1
    if frac_size == 0 and int_size == 0:
        # never generate a zero-width format
        int_size = 1
    return fixed_point(int_size, frac_size, signed=signed)
def round_sollya_object(self, value, round_mode=sollya.RN):
    """Round value to a multiple of 2**-self.get_frac_size().

    Args:
        value: number to round.
        round_mode: one of sollya.RN, sollya.RD, sollya.RU, sollya.RZ;
            any other key raises KeyError.

    Returns:
        value scaled by 2**frac_size, rounded to an integer with the
        function selected by round_mode, then scaled back.
    """
    def round_toward_zero(x):
        # floor for strictly positive inputs, ceil otherwise
        if x > 0:
            return sollya.floor(x)
        return sollya.ceil(x)

    integer_rounding = {
        sollya.RN: sollya.nearestint,
        sollya.RD: sollya.floor,
        sollya.RU: sollya.ceil,
        sollya.RZ: round_toward_zero,
    }
    to_integer = integer_rounding[round_mode]

    scale_factor = S2**self.get_frac_size()
    return to_integer(scale_factor * value) / scale_factor
def __init__(self, low_exp_value, max_exp_value, field_bits, precision):
    """Describe a split over an exponent range and a number of field bits.

    Args:
        low_exp_value: lowest exponent value covered.
        max_exp_value: highest exponent value covered.
        field_bits: number of field bits used (must be >= 0).
        precision: stored unchanged in self.precision.

    Attributes set:
        exp_bits: ceil(log2(number of exponent values)).
        split_num: number of exponent values times 2**field_bits.
    """
    self.field_bits = field_bits
    self.low_exp_value = low_exp_value
    self.max_exp_value = max_exp_value

    # number of distinct exponent values in [low_exp_value, max_exp_value]
    exp_count = max_exp_value - low_exp_value + 1
    exp_bits = int(sollya.ceil(sollya.log2(exp_count)))
    assert exp_bits >= 0 and field_bits >= 0 and (exp_bits + field_bits) > 0
    self.exp_bits = exp_bits

    self.split_num = exp_count * 2**(field_bits)
    Log.report(Log.Debug, "split_num={}", self.split_num)
    self.precision = precision
def solve_format_Constant(optree):
    """ Legalize Constant node

        Returns the node's own precision when its value is a special value
        or when a precision has already been set; otherwise the value must
        be an integer and an integer-only fixed-point format is returned.
    """
    assert isinstance(optree, Constant)
    value = optree.get_value()

    if FP_SpecialValue.is_special_value(value):
        return optree.get_precision()

    forced_precision = optree.get_precision()
    if forced_precision is not None:
        # precision was already set (manually forced): keep it
        return forced_precision

    # fixed-point format solving, integer constants only
    assert int(value) == value
    magnitude = abs(value)
    signed = value < 0
    int_size = max(int(sollya.ceil(sollya.log2(magnitude + 1))), 0)
    if signed:
        # one extra bit for the sign
        int_size += 1
    frac_size = 0
    if int_size == 0:
        # never generate a zero-width format
        int_size = 1
    return fixed_point(int_size, frac_size, signed=signed)
def computeNeededVariableFormat(self, I, epsTarget, variableFormat):
    """Choose between variableFormat and a format derived from epsTarget.

    Args:
        I: unused by this implementation.
        epsTarget: target error; a non-positive value keeps variableFormat.
        variableFormat: current format of the variable.

    Returns:
        variableFormat, unless a format built from epsTarget by
        self.get_format_from_accuracy has a strictly smaller bit size.
    """
    if not epsTarget > 0:
        return variableFormat

    # TODO: fix to support ML_Binary32
    # FIXME: default to minimal precision (self.limb_format)
    if (epsTarget >= self.MIN_LIMB_ERROR
            or variableFormat.mp_node.precision is self.limb_format):
        return variableFormat

    target_accuracy = sollya.ceil(-sollya.log2(epsTarget))
    target_format = self.get_format_from_accuracy(
        target_accuracy,
        eps_target=epsTarget,
        interval=variableFormat.mp_node.interval)

    target_bits = target_format.mp_node.precision.get_bit_size()
    current_bits = variableFormat.mp_node.precision.get_bit_size()
    if target_bits < current_bits:
        return target_format
    # variableFormat is not larger (in bits) than the target format:
    # keep it
    return variableFormat
def determine_minimal_fixed_format_cst(value):
    """ determine the minimal size format which can encode
        exactly the constant value value

        Args:
            value: constant to encode.

        Returns:
            the result of fixed_point(int_size, frac_size, signed=...) where
            frac_size is the smallest shift (below FRAC_THRESHOLD) making
            value * 2**frac_size an integer and the format is signed when
            value is negative.

        An error is reported through Log.report when no such shift exists.
    """
    # fixed-point format solving
    FRAC_THRESHOLD = 100  # maximum number of frac bit to be tested
    # TODO: fix
    frac_size = -1
    for i in range(FRAC_THRESHOLD):
        scaled_value = value * 2**i
        if int(scaled_value) == scaled_value:
            frac_size = i
            break
    if frac_size < 0:
        # no node is available in this function: only the value and the
        # search bound are reported
        Log.report(
            Log.Error,
            "value {} can not be encoded exactly with less than {} fractional bits",
            value, FRAC_THRESHOLD)

    abs_value = abs(value)
    signed = value < 0
    # one extra integer bit is added for the sign of negative values
    int_size = (max(int(sollya.ceil(sollya.log2(abs_value + 1))), 0)
                + (1 if signed else 0))
    if frac_size == 0 and int_size == 0:
        # never generate a zero-width format
        int_size = 1
    return fixed_point(int_size, frac_size, signed=signed)
def generate_scheme(self):
    """ main scheme generation

        Declares three fixed-point input signals with known intervals,
        builds a few derived nodes and asserts that evaluate_range returns
        the interval computed by hand for each of them.
        Returns [self.implementation].
    """
    int_size = 3
    frac_size = self.width - int_size

    input_precision = fixed_point(int_size, frac_size)
    output_precision = fixed_point(int_size, frac_size)

    # node -> interval expected from the range evaluation
    expected_interval = {}

    def declare_input(name, low, high):
        """ declare one input signal and register its interval """
        signal = self.implementation.add_input_signal(name, input_precision)
        signal_interval = Interval(low, high)
        signal.set_interval(signal_interval)
        expected_interval[signal] = signal_interval
        return signal, signal_interval

    # declaring main input variables
    var_x, x_interval = declare_input("x", -10.3, 10.7)
    var_y, y_interval = declare_input("y", -17.9, 17.2)
    var_z, z_interval = declare_input("z", -7.3, 7.7)

    cst = Constant(42.5, tag="cst")
    expected_interval[cst] = Interval(42.5)

    conv_ceil = Ceil(var_x, tag="ceil")
    expected_interval[conv_ceil] = sollya.ceil(x_interval)

    conv_floor = Floor(var_y, tag="floor")
    expected_interval[conv_floor] = sollya.floor(y_interval)

    mult = var_z * var_x
    mult.set_tag("mult")
    mult_interval = z_interval * x_interval
    expected_interval[mult] = mult_interval

    large_add = (var_x + var_y) - mult
    large_add.set_attributes(tag="large_add")
    large_add_interval = (x_interval + y_interval) - mult_interval
    expected_interval[large_add] = large_add_interval

    reduced_result = Max(0, Min(large_add, 13))
    reduced_result.set_tag("reduced_result")
    reduced_result_interval = interval_max(
        Interval(0),
        interval_min(large_add_interval, Interval(13)))
    expected_interval[reduced_result] = reduced_result_interval

    select_result = Select(var_x > var_y, reduced_result, var_z,
                           tag="select_result")
    select_interval = interval_union(reduced_result_interval, z_interval)
    expected_interval[select_result] = select_interval

    # checking interval evaluation
    checked_nodes = [
        cst, var_x, var_y, mult, large_add, reduced_result,
        select_result, conv_ceil, conv_floor,
    ]
    for node in checked_nodes:
        computed = evaluate_range(node)
        reference = expected_interval[node]
        print("{}: {} vs expected {}".format(node.get_tag(), computed, reference))
        assert computed is not None
        assert computed == reference

    return [self.implementation]
def generate_argument_reduction(self, memory_limit):
    """Return the parameters of the argument reduction to use.

    The function currently short-circuits: it evaluates the fixed
    parameter set (6, 10, 12, 13) with self.eval_argument_reduction,
    completes the resulting dict with 'sizeof_tables' (sum of both table
    sizes), 'degree_poly1' = 4 and 'degree_poly2' = 8, and returns it.
    memory_limit is not used on this path.

    The exhaustive search which follows the early return is unreachable;
    it is kept (with syntax valid for both Python 2 and Python 3) so that
    it can be re-enabled by removing the shortcut.

    Args:
        memory_limit: bound on the total table size, only used by the
            (currently unreachable) exhaustive search.

    Returns:
        dict describing the selected argument reduction.
    """
    best_arg_reduc = None

    # hard-coded shortcut: skip the exhaustive search below
    best_arg_reduc = self.eval_argument_reduction(6, 10, 12, 13)
    best_arg_reduc['sizeof_tables'] = best_arg_reduc['sizeof_table1'] + best_arg_reduc['sizeof_table2']
    best_arg_reduc['degree_poly1'] = 4
    best_arg_reduc['degree_poly2'] = 8
    return best_arg_reduc

    # NOTE: everything below is unreachable while the shortcut above is
    # in place.
    #
    # iterate through all possible parameters, and return the best argument reduction
    # the order of importance of the characteristics of a good argument reduction is:
    #   1- the argument reduction is valid
    #   2- the degrees of the polynomials obtained are minimal
    #   3- the memory used is minimal
    # An argument reduction is valid iff:
    #   - the memory used is less than memory_limit
    #   - y-1 and z-1 fit into a uint64_t
    #   - the second argument reduction should be useful (ie: it should add at least 1 bit to the argument reduction)
    # From those validity constraints we deduce some bounds on the parameters to reduce the space of values searched:
    # (note that those bounds are implied by, but not equivalent to, the constraints)
    #   size1 <= log2(memory_limit/17)                                       (memory_limit on the first table)
    #   prec1 < 13 + size1                                                   (y-1 fits into a uint64_t)
    #   size2 <= log2((memory_limit - sizeof_table1)/17/midinterval)          (memory_limit on both tables)
    #   size2 >= 1 - log2(midinterval)                                       (second arg red should be useful)
    #   prec2 < 12 - prec1 - log2((y-y1)/y1),  for all possible y            (z-1 fits into a uint64_t)
    # note: it is hard to deduce a tight bound on prec2 from the last inequality
    # a good approximation is size2 ~= max[for y]( - log2((y-y1)/y1)), but using it may eliminate valid arg reduc

    min_size1 = 1
    max_size1 = floor(log(memory_limit/17)/log(2)).getConstantAsInt()
    for size1 in range(max_size1, min_size1-1, -1):

        min_prec1 = size1
        max_prec1 = 12 + size1
        for prec1 in range(min_prec1, max_prec1+1):

            # we need sizeof_table1 and mid_interval for the bound on size2 and prec2
            first_arg_reduc = self.eval_argument_reduction(size1, prec1, prec1, prec1)
            mid_interval = first_arg_reduc['mid_interval']
            sizeof_table1 = first_arg_reduc['sizeof_table1']

            if not(0 <= inf(mid_interval) and sup(mid_interval) < S2**(64 - 52 - prec1)):
                continue
            if not(first_arg_reduc['sizeof_table1'] < memory_limit):
                continue

            min_size2 = 1 - ceil(log(sup(mid_interval))/log(2)).getConstantAsInt()
            max_size2 = floor(log((memory_limit - sizeof_table1)/(17 * sup(mid_interval)))/log(2)).getConstantAsInt()
            # the prec2 loop may shrink the interval of valid values for prec2,
            # so min_prec2 and max_prec2 are set here and not just before the prec2 loop
            # (they are modified inside the loop body, for the next iteration of size2)
            min_prec2 = 0
            max_prec2 = 12 + max_size2 - prec1
            for size2 in range(max_size2, min_size2-1, -1):

                max_prec2 = min(max_prec2, 12 + size2 - prec1)
                for prec2 in range(max_prec2, min_prec2-1, -1):

                    arg_reduc = self.eval_argument_reduction(size1, prec1, size2, prec2)
                    mid_interval = arg_reduc['mid_interval']
                    out_interval = arg_reduc['out_interval']
                    sizeof_tables = arg_reduc['sizeof_table1'] + arg_reduc['sizeof_table2']
                    if not(0 <= inf(out_interval) and sup(out_interval) < S2**(64-52-prec1-prec2)):
                        max_prec2 = prec2 - 1
                        continue
                    if memory_limit < sizeof_tables:
                        continue
                    #assert(prec2 < 12 + size2 - prec1) # test the approximation size2 ~= max[for y]( - log2((y-y1)/y1))

                    # guess the degree of the two polynomials (relative error <= 2^-52 and absolute error <= 2^-120)
                    # note: we exclude zero from out_interval to not perturb sollya (log(1+x)/x is not well defined on 0)
                    sollya_out_interval = Interval(S2**(-52-prec1-prec2), sup(out_interval))
                    guess_degree_poly1 = guessdegree(log(1+sollya.x)/sollya.x, sollya_out_interval, S2**-52)
                    guess_degree_poly2 = guessdegree(log(1+sollya.x), sollya_out_interval, S2**-120)
                    # TODO: detect when guessdegree return multiple possible degree, and find the right one
                    if False and inf(guess_degree_poly1) != sup(guess_degree_poly1):
                        print("improvable guess_degree_poly1: {}".format(guess_degree_poly1))
                    if False and inf(guess_degree_poly2) != sup(guess_degree_poly2):
                        print("improvable guess_degree_poly2: {}".format(guess_degree_poly2))
                    degree_poly1 = sup(guess_degree_poly1).getConstantAsInt() + 1
                    degree_poly2 = sup(guess_degree_poly2).getConstantAsInt()

                    # the current best is strictly better on one degree:
                    # lower values of prec2 are not explored further
                    if ((best_arg_reduc is not None)
                            and (best_arg_reduc['degree_poly1'] < degree_poly1
                                 or best_arg_reduc['degree_poly2'] < degree_poly2)):
                        min_prec2 = prec2 + 1
                        break

                    # lexicographic comparison on
                    # (degree_poly1, degree_poly2, sizeof_tables)
                    if ((best_arg_reduc is None)
                            or (best_arg_reduc['degree_poly1'] > degree_poly1)
                            or (best_arg_reduc['degree_poly1'] == degree_poly1
                                and best_arg_reduc['degree_poly2'] > degree_poly2)
                            or (best_arg_reduc['degree_poly1'] == degree_poly1
                                and best_arg_reduc['degree_poly2'] == degree_poly2
                                and best_arg_reduc['sizeof_tables'] > sizeof_tables)):
                        arg_reduc['degree_poly1'] = degree_poly1
                        arg_reduc['degree_poly2'] = degree_poly2
                        arg_reduc['sizeof_tables'] = sizeof_tables
                        best_arg_reduc = arg_reduc

    return best_arg_reduc
def generate_scalar_scheme(self, vx):
    """Build the scalar evaluation scheme for input node vx.

    Following the derivation in the comments below, the scheme evaluates
    sinh(|vx|) as the difference of two exponentials reconstructed from a
    table of 2**(+/-l / 2**index_size) values and a polynomial
    approximation of exp on the reduced argument, then re-applies the sign
    of vx. Inputs whose magnitude exceeds the rounded-down value of
    asinh(max value of self.precision) return a signed infinity.

    Args:
        vx: input operation node.

    Returns:
        a Statement wrapping the Return of the selected result.
    """
    Log.set_dump_stdout(True)

    Log.report(Log.Info, "\033[33;1m generating implementation scheme \033[0m")
    if self.debug_flag:
        Log.report(Log.Info, "\033[31;1m debug has been enabled \033[0;m")

    index_size = 5

    comp_lo = (vx < 0)
    comp_lo.set_attributes(tag="comp_lo", precision=ML_Bool)
    sign = Select(comp_lo, -1, 1, precision=self.precision)

    # as sinh is an odd function, we can simplify the input to its absolute
    # value once the sign has been extracted
    vx = Abs(vx)
    int_precision = self.precision.get_integer_format()

    # argument reduction
    arg_reg_value = log(2)/2**index_size
    inv_log2_value = round(1/arg_reg_value, self.precision.get_sollya_object(), sollya.RN)
    inv_log2_cst = Constant(inv_log2_value, precision=self.precision, tag="inv_log2")

    # for r_hi to be accurate we ensure k * log2_hi_value_cst is exact
    # by limiting the number of non-zero bits in log2_hi_value_cst
    # cosh(x) ~ exp(abs(x))/2  for a big enough x
    # cosh(x) > 2^1023 <=> exp(x) > 2^1024 <=> x > log(2^1024)
    # k = inv_log2_value * x
    # -1 for guard
    max_k_approx = inv_log2_value * log(sollya.SollyaObject(2)**1024)
    max_k_bitsize = int(ceil(log2(max_k_approx)))
    Log.report(Log.Info, "max_k_bitsize: %d" % max_k_bitsize)
    log2_hi_value_precision = self.precision.get_precision() - max_k_bitsize - 1

    log2_hi_value = round(arg_reg_value, log2_hi_value_precision, sollya.RN)
    log2_lo_value = round(arg_reg_value - log2_hi_value, self.precision.get_sollya_object(), sollya.RN)
    log2_hi_value_cst = Constant(log2_hi_value, tag="log2_hi_value", precision=self.precision)
    log2_lo_value_cst = Constant(log2_lo_value, tag="log2_lo_value", precision=self.precision)

    k = Trunc(Multiplication(inv_log2_cst, vx), precision=self.precision)
    k_log2 = Multiplication(k, log2_hi_value_cst, precision=self.precision, exact=True, tag="k_log2", unbreakable=True)
    r_hi = vx - k_log2
    r_hi.set_attributes(tag="r_hi", debug=debug_multi, unbreakable=True)
    r_lo = -k * log2_lo_value_cst
    # reduced argument
    r = r_hi + r_lo
    r.set_attributes(tag="r", debug=debug_multi)

    if is_gappa_installed():
        r_eval_error = self.get_eval_error(r_hi, variable_copy_map={
            vx: Variable("vx", interval=Interval(0, 715), precision=self.precision),
            k: Variable("k", interval=Interval(0, 1024), precision=self.precision)
        })
        # the placeholder is required for the value to appear in the report
        Log.report(Log.Verbose, "r_eval_error: {}", r_eval_error)

    approx_interval = Interval(-arg_reg_value, arg_reg_value)
    error_goal_approx = 2**-(self.precision.get_precision())

    poly_degree = sup(guessdegree(exp(sollya.x), approx_interval, error_goal_approx)) + 3
    precision_list = [1] + [self.precision] * (poly_degree)

    k_integer = Conversion(k, precision=int_precision, tag="k_integer", debug=debug_multi)
    k_hi = BitLogicRightShift(k_integer, Constant(index_size, precision=int_precision), tag="k_int_hi", precision=int_precision, debug=debug_multi)
    k_lo = Modulo(k_integer, 2**index_size, tag="k_int_lo", precision=int_precision, debug=debug_multi)
    pow_exp = ExponentInsertion(Conversion(k_hi, precision=int_precision), precision=self.precision, tag="pow_exp", debug=debug_multi)

    # each row holds [neg_hi, neg_lo, pos_hi, pos_lo] for one index value
    exp_table = ML_NewTable(dimensions=[2 * 2**index_size, 4], storage_precision=self.precision, tag=self.uniquify_name("exp2_table"))
    for i in range(2 * 2**index_size):
        # the upper half of the table maps to indexes 0 .. 2**index_size - 1
        input_value = i - 2**index_size if i >= 2**index_size else i

        reduced_hi_prec = int(self.precision.get_mantissa_size() - 8)
        # using SollyaObject wrapper to force evaluation by sollya
        # with higher precision
        exp_value = sollya.SollyaObject(2)**((input_value) * 2**-index_size)
        mexp_value = sollya.SollyaObject(2)**((-input_value) * 2**-index_size)
        pos_value_hi = round(exp_value, reduced_hi_prec, sollya.RN)
        pos_value_lo = round(exp_value - pos_value_hi, self.precision.get_sollya_object(), sollya.RN)
        neg_value_hi = round(mexp_value, reduced_hi_prec, sollya.RN)
        neg_value_lo = round(mexp_value - neg_value_hi, self.precision.get_sollya_object(), sollya.RN)
        exp_table[i][0] = neg_value_hi
        exp_table[i][1] = neg_value_lo
        exp_table[i][2] = pos_value_hi
        exp_table[i][3] = pos_value_lo

    # log2_value = log(2) / 2^index_size
    # sinh(x) = 1/2 * (exp(x) - exp(-x))
    # exp(x) = exp(x - k * log2_value + k * log2_value)
    #
    # r = x - k * log2_value
    # exp(x) = exp(r) * 2 ^ (k / 2^index_size)
    #
    # k / 2^index_size = h + l * 2^-index_size, with k, h, l integers
    # exp(x) = exp(r) * 2^h * 2^(l *2^-index_size)
    #
    # sinh(x) = exp(r) * 2^(h-1) * 2^(l *2^-index_size) - exp(-r) * 2^(-h-1) * 2^(-l *2^-index_size)
    # S=2^(h-1), T = 2^(-h-1)
    # exp(r)  = 1 + poly_pos(r)
    # exp(-r) = 1 + poly_neg(r)
    # 2^(l / 2^index_size)  = pos_value_hi + pos_value_lo
    # 2^(-l / 2^index_size) = neg_value_hi + neg_value_lo
    #

    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    poly_object, poly_approx_error = Polynomial.build_from_approximation_with_error(exp(sollya.x), poly_degree, precision_list, approx_interval, sollya.absolute, error_function=error_function)

    Log.report(Log.Verbose, "poly_approx_error: {}, {}".format(poly_approx_error, float(log2(poly_approx_error))))

    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme
    # the constant coefficient is skipped: exp(+/-r) = 1 + poly(+/-r)
    poly_pos = polynomial_scheme_builder(poly_object.sub_poly(start_index=1), r, unified_precision=self.precision)
    poly_pos.set_attributes(tag="poly_pos", debug=debug_multi)

    poly_neg = polynomial_scheme_builder(poly_object.sub_poly(start_index=1), -r, unified_precision=self.precision)
    poly_neg.set_attributes(tag="poly_neg", debug=debug_multi)

    table_index = Addition(k_lo, Constant(2**index_size, precision=int_precision), precision=int_precision, tag="table_index", debug=debug_multi)

    neg_value_load_hi = TableLoad(exp_table, table_index, 0, tag="neg_value_load_hi", debug=debug_multi)
    neg_value_load_lo = TableLoad(exp_table, table_index, 1, tag="neg_value_load_lo", debug=debug_multi)
    pos_value_load_hi = TableLoad(exp_table, table_index, 2, tag="pos_value_load_hi", debug=debug_multi)
    pos_value_load_lo = TableLoad(exp_table, table_index, 3, tag="pos_value_load_lo", debug=debug_multi)

    # both exponents are bounded below by the minimal normal exponent
    k_plus = Max(
        Subtraction(k_hi, Constant(1, precision=int_precision), precision=int_precision, tag="k_plus", debug=debug_multi),
        Constant(self.precision.get_emin_normal(), precision=int_precision))
    k_neg = Max(
        Subtraction(-k_hi, Constant(1, precision=int_precision), precision=int_precision, tag="k_neg", debug=debug_multi),
        Constant(self.precision.get_emin_normal(), precision=int_precision))

    # 2^(h-1)
    pow_exp_pos = ExponentInsertion(k_plus, precision=self.precision, tag="pow_exp_pos", debug=debug_multi)
    # 2^(-h-1)
    pow_exp_neg = ExponentInsertion(k_neg, precision=self.precision, tag="pow_exp_neg", debug=debug_multi)

    hi_terms = (pos_value_load_hi * pow_exp_pos - neg_value_load_hi * pow_exp_neg)
    hi_terms.set_attributes(tag="hi_terms", debug=debug_multi)

    pos_exp = (pos_value_load_hi * poly_pos + (pos_value_load_lo + pos_value_load_lo * poly_pos)) * pow_exp_pos
    pos_exp.set_attributes(tag="pos_exp", debug=debug_multi)

    neg_exp = (neg_value_load_hi * poly_neg + (neg_value_load_lo + neg_value_load_lo * poly_neg)) * pow_exp_neg
    neg_exp.set_attributes(tag="neg_exp", debug=debug_multi)

    result = Addition(
        Subtraction(
            pos_exp,
            neg_exp,
            precision=self.precision,
        ),
        hi_terms,
        precision=self.precision,
        tag="result",
        debug=debug_multi
    )

    # ov_value
    ov_value = round(asinh(self.precision.get_max_value()), self.precision.get_sollya_object(), sollya.RD)
    ov_flag = Comparison(Abs(vx), Constant(ov_value, precision=self.precision), specifier=Comparison.Greater)

    # main scheme
    scheme = Statement(
        Return(
            Select(
                ov_flag,
                sign*FP_PlusInfty(self.precision),
                sign*result
            )))

    return scheme
def generate_payne_hanek(vx, frac_pi, precision, n=100, k=4, chunk_num=None, debug=False):
    """ generate payne and hanek argument reduction for frac_pi * variable

        Args:
            vx: input node to be reduced.
            frac_pi: constant multiplicand, split into chunks stored in a
                table.
            precision: floating-point format of the computation, must be
                ML_Binary32 or ML_Binary64 (KeyError otherwise).
            n: the result targets an accuracy of 2**-n; also the number of
                extra bits added to sollya's working precision.
            k: the integer accumulator is kept modulo 2**(k+1).
            chunk_num: unused by this implementation.
            debug: when set, attaches debug attributes to a few integer
                nodes.

        Returns:
            (result, acc, acc_int): the Statement implementing the
            reduction loop, the fractional accumulator variable and the
            integer accumulator variable.
    """
    # NOTE(review): this sets an attribute on the sollya module itself, not
    # on sollya.settings; confirm that it actually disables the warnings
    sollya.roundingwarnings = sollya.off
    debug_precision = debug_multi
    int_precision = {ML_Binary32: ML_Int32, ML_Binary64: ML_Int64}[precision]

    p = precision.get_field_size()

    # weight of the most significant digit of the constant
    cst_msb = floor(log2(abs(frac_pi)))
    # length of exponent range which must be covered by the approximation
    # of the constant
    cst_exp_range = cst_msb - precision.get_emin_subnormal() + 1

    # chunk size has to be so than multiplication by a splitted <v>
    # (vx_hi or vx_lo) is exact
    # (integer division: sizes must stay integers for odd field sizes)
    chunk_size = precision.get_field_size() // 2 - 2
    chunk_number = int(ceil((cst_exp_range + chunk_size - 1) / chunk_size))
    scaling_factor = S2**-(chunk_size // 2)

    chunk_size_cst = Constant(chunk_size, precision=ML_Int32)
    cst_msb_node = Constant(cst_msb, precision=ML_Int32)

    # Saving sollya's global precision
    old_global_prec = sollya.settings.prec
    # the setting is changed by assignment, as for the restore at the end
    # of this function
    sollya.settings.prec = cst_exp_range + n

    # table to store chunk of constant multiplicand
    cst_table = ML_NewTable(dimensions=[chunk_number, 1],
                            storage_precision=precision,
                            tag="PH_cst_table")
    # table to store sqrt(scaling_factor) corresponding to the
    # cst multiplicand chunks
    scale_table = ML_NewTable(dimensions=[chunk_number, 1],
                              storage_precision=precision,
                              tag="PH_scale_table")
    tmp_cst = frac_pi

    # cst_table stores normalized constant chunks (they have been
    # scale back to close to 1.0 interval)
    #
    # scale_table stores the scaling factors corresponding to the
    # denormalization of cst_table coefficients

    # this loop divide the digits of frac_pi into chunks
    # the chunk lsb weight is given by a shift from
    # cst_msb, multiple of the chunk index
    for i in range(chunk_number):
        value_div_factor = S2**(chunk_size * (i + 1) - cst_msb)
        local_cst = int(tmp_cst * value_div_factor) / value_div_factor
        local_scale = (scaling_factor**i)
        # storing scaled constant chunks
        cst_table[i][0] = local_cst / (local_scale**2)
        scale_table[i][0] = local_scale
        # Updating constant value
        tmp_cst = tmp_cst - local_cst

    # Computing which part of the constant we do not need to multiply
    # In the following comments, vi represents the bit of frac_pi of weight 2**-i

    # Bits vi so that i <= (vx_exp - p + 1 -k) are not needed, because they result
    # in a multiple of 2pi and do not contribute to trig functions.

    vx_exp = ExponentExtraction(
        vx, precision=vx.get_precision().get_integer_format())
    vx_exp = Conversion(vx_exp, precision=ML_Int32)

    msb_exp = -(vx_exp - p + 1 - k)
    msb_exp.set_attributes(tag="msb_exp", debug=debug_multi)
    msb_exp = Conversion(msb_exp, precision=ML_Int32)

    # Select the highest index where the reduction should start
    msb_index = Select(cst_msb_node < msb_exp, 0,
                       (cst_msb_node - msb_exp) / chunk_size_cst)
    msb_index.set_attributes(tag="msb_index", debug=debug_multi)

    # For a desired accuracy of 2**-n, bits vi so that i >= (vx_exp + n + 4) are not needed, because they contribute less than
    # 2**-n to the result

    lsb_exp = -(vx_exp + n + 4)
    lsb_exp.set_attributes(tag="lsb_exp", debug=debug_multi)
    lsb_exp = Conversion(lsb_exp, precision=ML_Int32)

    # Index of the corresponding chunk
    lsb_index = (cst_msb_node - lsb_exp) / chunk_size_cst
    lsb_index.set_attributes(tag="lsb_index", debug=debug_multi)

    # Splitting vx
    half_size = precision.get_field_size() // 2 + 1

    # hi part (most significant digit) of vx input
    vx_hi = TypeCast(BitLogicAnd(
        TypeCast(vx, precision=int_precision),
        Constant(~int(2**half_size - 1), precision=int_precision)),
                     precision=precision)
    vx_hi.set_attributes(tag="vx_hi_ph")  #, debug = debug_multi)

    vx_lo = vx - vx_hi
    vx_lo.set_attributes(tag="vx_lo_ph")  #, debug = debug_multi)

    # loop iterator variable
    vi = Variable("i", precision=ML_Int32, var_type=Variable.Local)
    # step scaling factor
    half_scaling = Constant(S2**(-chunk_size // 2), precision=precision)

    i1 = Constant(1, precision=ML_Int32)

    # accumulator to the output precision
    acc = Variable("acc", precision=precision, var_type=Variable.Local)
    # integer accumulator
    acc_int = Variable("acc_int",
                       precision=int_precision,
                       var_type=Variable.Local)

    init_loop = Statement(
        vx_hi,
        vx_lo,
        ReferenceAssign(vi, msb_index),
        ReferenceAssign(acc, Constant(0, precision=precision)),
        ReferenceAssign(acc_int, Constant(0, precision=int_precision)),
    )

    cst_load = TableLoad(cst_table,
                         vi,
                         0,
                         tag="cst_load",
                         debug=debug_precision)
    sca_load = TableLoad(scale_table,
                         vi,
                         0,
                         tag="sca_load",
                         debug=debug_precision)
    # loop body
    # hi_mult = vx_hi * <scale_factor> * <cst>
    hi_mult = (vx_hi * sca_load) * (cst_load * sca_load)
    hi_mult.set_attributes(tag="hi_mult", debug=debug_precision)
    pre_hi_mult_int = NearestInteger(hi_mult,
                                     precision=int_precision,
                                     tag="hi_mult_int",
                                     debug=(debuglld if debug else None))
    hi_mult_int_f = Conversion(pre_hi_mult_int,
                               precision=precision,
                               tag="hi_mult_int_f",
                               debug=debug_precision)
    pre_hi_mult_red = (hi_mult - hi_mult_int_f).modify_attributes(
        tag="hi_mult_red", debug=debug_precision)

    # for the first chunks (vx_hi * <constant chunk>) exceeds 2**k+1 and may be
    # discard (whereas it may lead to overflow during integer conversion
    pre_exclude_hi = ((cst_msb_node - (vi + i1) * chunk_size + i1) +
                      (vx_exp + Constant(-half_size + 1, precision=ML_Int32))
                      ).modify_attributes(tag="pre_exclude_hi",
                                          debug=(debugd if debug else None))
    pre_exclude_hi.propagate_precision(ML_Int32,
                                       [cst_msb_node, vi, vx_exp, i1])
    Ck = Constant(k, precision=ML_Int32)
    exclude_hi = pre_exclude_hi <= Ck
    exclude_hi.set_attributes(tag="exclude_hi", debug=debug_multi)

    hi_mult_red = Select(exclude_hi, pre_hi_mult_red,
                         Constant(0, precision=precision))
    hi_mult_int = Select(exclude_hi, pre_hi_mult_int,
                         Constant(0, precision=int_precision))

    # lo part of the chunk reduction
    lo_mult = (vx_lo * sca_load) * (cst_load * sca_load)
    lo_mult.set_attributes(tag="lo_mult")  #, debug = debug_multi)
    lo_mult_int = NearestInteger(lo_mult,
                                 precision=int_precision,
                                 tag="lo_mult_int")  #, debug = debug_multi
    lo_mult_int_f = Conversion(lo_mult_int,
                               precision=precision,
                               tag="lo_mult_int_f")  #, debug = debug_multi)
    lo_mult_red = (lo_mult - lo_mult_int_f).modify_attributes(
        tag="lo_mult_red")  #, debug = debug_multi)

    # accumulating fractional part
    acc_expr = (acc + hi_mult_red) + lo_mult_red
    # accumulating integer part
    int_expr = ((acc_int + hi_mult_int) + lo_mult_int) % 2**(k + 1)

    CF1 = Constant(1, precision=precision)
    CI1 = Constant(1, precision=int_precision)

    # extracting exceeding integer part in fractionnal accumulator
    acc_expr_int = NearestInteger(acc_expr, precision=int_precision)
    # normalizing integer and fractionnal accumulator by subtracting then
    # adding exceeding integer part
    normalization = Statement(
        ReferenceAssign(
            acc, acc_expr - Conversion(acc_expr_int, precision=precision)),
        ReferenceAssign(acc_int, int_expr + acc_expr_int),
    )

    acc_expr.set_attributes(tag="acc_expr")  #, debug = debug_multi)
    int_expr.set_attributes(tag="int_expr")  #, debug = debug_multi)

    red_loop = Loop(
        init_loop, vi <= lsb_index,
        Statement(acc_expr, int_expr, normalization,
                  ReferenceAssign(vi, vi + 1)))

    result = Statement(lsb_index, msb_index, red_loop)

    # restoring sollya's global precision
    sollya.settings.prec = old_global_prec

    return result, acc, acc_int
def generate_payne_hanek(vx, frac_pi, precision, n=100, k=4, chunk_num=None, debug=False):
    """ generate payne and hanek argument reduction for frac_pi * variable

        Args:
            vx: input node to be reduced.
            frac_pi: constant multiplicand, split into chunks stored in a
                table.
            precision: floating-point format of the computation, must be
                ML_Binary32 or ML_Binary64 (KeyError otherwise).
            n: number of bits used to position the last chunk of the
                constant taken into account; also the number of extra bits
                added to sollya's working precision.
            k: the integer accumulator is kept modulo 2**(k+1).
            chunk_num: unused by this implementation.
            debug: when set, attaches debug attributes to the generated
                nodes.

        Returns:
            (result, acc, acc_int): the Statement implementing the
            reduction loop, the fractional accumulator variable and the
            integer accumulator variable.
    """
    # determining integer format corresponding to
    # floating point precision argument
    int_precision = {ML_Binary64: ML_Int64, ML_Binary32: ML_Int32}[precision]

    cst_msb = floor(log2(abs(frac_pi)))
    cst_exp_range = cst_msb - precision.get_emin_subnormal() + 1

    # chunk size has to be so than multiplication by a splitted <v> (vx_hi or vx_lo)
    # is exact
    chunk_size = 20  # precision.get_field_size() / 2 - 2
    chunk_number = int(ceil((cst_exp_range + chunk_size - 1) / chunk_size))
    # integer division keeps the exponent an integer
    scaling_factor = S2**-(chunk_size // 2)

    chunk_size_cst = Constant(chunk_size, precision=ML_Int32)
    cst_msb_node = Constant(cst_msb, precision=ML_Int32)

    p = precision.get_field_size()

    # adapting debug format to precision argument
    debug_precision = {
        ML_Binary32: debug_ftox,
        ML_Binary64: debug_lftolx
    }[precision] if debug else None

    # saving sollya's global precision
    old_global_prec = get_prec()
    # n extra bits (100 by default) on top of the covered exponent range
    prec(cst_exp_range + n)

    # table to store chunk of constant multiplicand
    cst_table = ML_Table(dimensions=[chunk_number, 1],
                         storage_precision=precision,
                         tag="PH_cst_table")
    # table to store sqrt(scaling_factor) corresponding to the cst multiplicand chunks
    scale_table = ML_Table(dimensions=[chunk_number, 1],
                           storage_precision=precision,
                           tag="PH_scale_table")
    tmp_cst = frac_pi

    # this loop divide the digits of frac_pi into chunks
    # the chunk lsb weight is given by a shift from
    # cst_msb, multiple of the chunk index
    for i in range(chunk_number):
        value_div_factor = S2**(chunk_size * (i + 1) - cst_msb)
        local_cst = int(tmp_cst * value_div_factor) / value_div_factor
        local_scale = (scaling_factor**i)
        # storing scaled constant chunks
        cst_table[i][0] = local_cst / (local_scale**2)
        scale_table[i][0] = local_scale
        tmp_cst = tmp_cst - local_cst

    vx_exp = ExponentExtraction(vx)
    msb_exp = -vx_exp + p - 1 + k
    msb_exp.set_attributes(tag="msb_exp", debug=(debugd if debug else None))

    msb_index = Select(cst_msb_node < msb_exp, 0,
                       (cst_msb_node - msb_exp) / chunk_size_cst)
    msb_index.set_attributes(tag="msb_index",
                             debug=(debugd if debug else None))

    lsb_exp = -vx_exp + p - 1 - n
    lsb_exp.set_attributes(tag="lsb_exp", debug=(debugd if debug else None))

    lsb_index = (cst_msb_node - lsb_exp) / chunk_size_cst
    lsb_index.set_attributes(tag="lsb_index",
                             debug=(debugd if debug else None))

    half_size = precision.get_field_size() // 2 + 1

    # the cast and the mask use the integer format matching precision
    vx_hi = TypeCast(BitLogicAnd(
        TypeCast(vx, precision=int_precision),
        Constant(~(2**half_size - 1), precision=int_precision)),
                     precision=precision)
    vx_hi.set_attributes(tag="vx_hi", debug=debug_precision)

    vx_lo = vx - vx_hi
    vx_lo.set_attributes(tag="vx_lo", debug=debug_precision)

    vi = Variable("i", precision=ML_Int32, var_type=Variable.Local)

    half_scaling = Constant(S2**(-chunk_size // 2), precision=precision)

    i1 = Constant(1, precision=ML_Int32)

    acc = Variable("acc", precision=precision, var_type=Variable.Local)
    acc_int = Variable("acc_int",
                       precision=int_precision,
                       var_type=Variable.Local)

    init_loop = Statement(
        vx_hi,
        vx_lo,
        ReferenceAssign(vi, msb_index),
        ReferenceAssign(acc, Constant(0, precision=precision)),
        # acc_int is an integer variable: initialise it with an integer
        # constant
        ReferenceAssign(acc_int, Constant(0, precision=int_precision)),
    )

    cst_load = TableLoad(cst_table,
                         vi,
                         0,
                         tag="cst_load",
                         debug=debug_precision)
    sca_load = TableLoad(scale_table,
                         vi,
                         0,
                         tag="sca_load",
                         debug=debug_precision)

    hi_mult = (vx_hi * sca_load) * (cst_load * sca_load)
    hi_mult.set_attributes(tag="hi_mult", debug=debug_precision)

    pre_hi_mult_int = NearestInteger(hi_mult,
                                     precision=int_precision,
                                     tag="hi_mult_int",
                                     debug=(debuglld if debug else None))
    hi_mult_int_f = Conversion(pre_hi_mult_int,
                               precision=precision,
                               tag="hi_mult_int_f",
                               debug=debug_precision)
    pre_hi_mult_red = (hi_mult - hi_mult_int_f).modify_attributes(
        tag="hi_mult_red", debug=debug_precision)

    # for the first chunks (vx_hi * <constant chunk>) exceeds 2**k+1 and may be
    # discard (whereas it may lead to overflow during integer conversion
    pre_exclude_hi = ((cst_msb_node - (vi + i1) * chunk_size + i1) +
                      (vx_exp + Constant(-half_size + 1, precision=ML_Int32))
                      ).modify_attributes(tag="pre_exclude_hi",
                                          debug=(debugd if debug else None))
    pre_exclude_hi.propagate_precision(ML_Int32,
                                       [cst_msb_node, vi, vx_exp, i1])
    Ck = Constant(k, precision=ML_Int32)
    exclude_hi = pre_exclude_hi <= Ck
    exclude_hi.set_attributes(tag="exclude_hi",
                              debug=(debugd if debug else None))

    hi_mult_red = Select(exclude_hi, pre_hi_mult_red,
                         Constant(0, precision=precision))
    hi_mult_int = Select(exclude_hi, pre_hi_mult_int,
                         Constant(0, precision=int_precision))

    lo_mult = (vx_lo * sca_load) * (cst_load * sca_load)
    lo_mult.set_attributes(tag="lo_mult", debug=debug_precision)
    lo_mult_int = NearestInteger(lo_mult,
                                 precision=int_precision,
                                 tag="lo_mult_int",
                                 debug=(debuglld if debug else None))
    lo_mult_int_f = Conversion(lo_mult_int,
                               precision=precision,
                               tag="lo_mult_int_f",
                               debug=debug_precision)
    lo_mult_red = (lo_mult - lo_mult_int_f).modify_attributes(
        tag="lo_mult_red", debug=debug_precision)

    acc_expr = (acc + hi_mult_red) + lo_mult_red
    int_expr = ((acc_int + hi_mult_int) + lo_mult_int) % 2**(k + 1)

    CF1 = Constant(1, precision=precision)
    CI1 = Constant(1, precision=int_precision)

    acc_expr_int = NearestInteger(acc_expr, precision=int_precision)

    # move the integer part which accumulated in the fractional accumulator
    # over to the integer accumulator
    normalization = Statement(
        ReferenceAssign(
            acc, acc_expr - Conversion(acc_expr_int, precision=precision)),
        ReferenceAssign(acc_int, int_expr + acc_expr_int),
    )

    acc_expr.set_attributes(tag="acc_expr", debug=debug_precision)
    int_expr.set_attributes(tag="int_expr",
                            debug=(debuglld if debug else None))

    red_loop = Loop(
        init_loop,
        vi <= lsb_index,
        Statement(
            acc_expr,
            int_expr,
            normalization,
            ReferenceAssign(vi, vi + 1)))

    result = Statement(lsb_index, msb_index, red_loop)

    # restoring sollya's global precision
    prec(old_global_prec)

    return result, acc, acc_int
def generate_scheme(self):
    """Build the fixed-point, table-driven evaluation scheme for cos or sin.

    The input is converted to a signed fixed-point format sized from
    ``self.input_interval``.  Its most significant bits index a fused
    cosine/sine table of ``2**self.table_size_log`` rows, and the remaining
    low part is handed, together with the degree-2 polynomial coefficients,
    to ``generate_cos_scheme`` or ``generate_sin_scheme`` depending on
    ``self.cos_output``.

    Returns:
        A ``Statement`` holding a single ``Return`` of the result converted
        to ``self.io_precisions[0]``.
    """
    # declaring the function input variable
    vx = self.implementation.add_input_variable("x", self.precision)

    Log.report(Log.Info, "target: %s " % self.processor.target_name)

    # display parameter information
    Log.report(Log.Info, "accuracy : %s " % self.accuracy)
    Log.report(Log.Info, "input interval: %s " % self.input_interval)

    accuracy_goal = self.accuracy.get_goal()
    Log.report(Log.Info, "accuracy_goal=%f" % accuracy_goal)

    table_size_log = self.table_size_log
    integer_size = 31

    max_bound = sup(abs(self.input_interval))
    max_bound_log = int(ceil(log2(max_bound)))
    Log.report(Log.Info, "max_bound_log=%s " % max_bound_log)
    scaling_power = integer_size - max_bound_log
    Log.report(Log.Info, "scaling power: %s " % scaling_power)

    storage_precision = ML_Custom_FixedPoint_Format(1, 30, signed=True)

    Log.report(Log.Info, "tabulating cosine and sine")
    # cosine and sine fused table: column 0 holds cos, column 1 holds sin
    fused_table = ML_NewTable(
        dimensions=[2**table_size_log, 2],
        storage_precision=storage_precision,
        tag="fast_lib_shared_table")
    # filling table: row i samples the point i * 2**(max_bound_log - table_size_log)
    for i in range(2**table_size_log):
        local_x = i / S2**table_size_log * S2**max_bound_log
        fused_table[i][0] = cos(local_x)
        fused_table[i][1] = sin(local_x)

    # argument reduction evaluation scheme
    red_vx_precision = ML_Custom_FixedPoint_Format(
        31 - scaling_power, scaling_power, signed=True)
    Log.report(
        Log.Verbose,
        "red_vx_precision.get_c_bit_size()=%d" % red_vx_precision.get_c_bit_size())
    red_vx = Conversion(vx, precision=red_vx_precision, tag="red_vx",
                        debug=debug_fixed32)

    computation_precision = red_vx_precision
    output_precision = self.io_precisions[0]
    Log.report(Log.Info, "computation_precision is %s" % computation_precision)
    Log.report(Log.Info, "storage_precision     is %s" % storage_precision)
    Log.report(Log.Info, "output_precision      is %s" % output_precision)

    # mask keeping the (table_size_log + 1) most significant bits of a
    # 32-bit word, i.e. the part of red_vx that selects the table row
    hi_mask_value = 2**32 - 2**(32 - table_size_log - 1)
    hi_mask = Constant(hi_mask_value, precision=ML_Int32)
    Log.report(Log.Info, "hi_mask=0x%x" % hi_mask_value)

    red_vx_hi_int = BitLogicAnd(TypeCast(red_vx, precision=ML_Int32), hi_mask,
                                precision=ML_Int32, tag="red_vx_hi_int",
                                debug=debugd)
    red_vx_hi = TypeCast(red_vx_hi_int, precision=red_vx_precision,
                         tag="red_vx_hi", debug=debug_fixed32)
    red_vx_lo = red_vx - red_vx_hi
    red_vx_lo.set_attributes(precision=red_vx_precision, tag="red_vx_lo",
                             debug=debug_fixed32)
    table_index = BitLogicRightShift(
        TypeCast(red_vx, precision=ML_Int32),
        scaling_power - (table_size_log - max_bound_log),
        precision=ML_Int32, tag="table_index", debug=debugd)

    tabulated_cos = TableLoad(fused_table, table_index, 0, tag="tab_cos",
                              precision=storage_precision, debug=debug_fixed32)
    tabulated_sin = TableLoad(fused_table, table_index, 1, tag="tab_sin",
                              precision=storage_precision, debug=debug_fixed32)

    def error_function(p, f, ai, mod, t):
        # approximation error measured as the sup-norm of (f - p) over ai
        return dirtyinfnorm(f - p, ai)

    Log.report(Log.Info, "building polynomial approximation for cosine")
    # cosine polynomial approximation
    poly_interval = Interval(0, S2**(max_bound_log - table_size_log))
    Log.report(Log.Info, "poly_interval=%s " % poly_interval)

    Log.report(Log.Verbose, "cosine polynomial approximation")
    cos_poly_object, cos_approx_error = Polynomial.build_from_approximation_with_error(
        cos(x), [0, 2],
        [0] + [computation_precision.get_bit_size()],
        poly_interval, sollya.absolute,
        error_function=error_function)
    Log.report(Log.Info, "cos_approx_error=%e" % cos_approx_error)
    cos_coeff_list = cos_poly_object.get_ordered_coeff_list()
    # only the degree-2 coefficient is forwarded to the evaluation scheme
    coeff_C2 = Constant(
        cos_coeff_list[1][1],
        precision=ML_Custom_FixedPoint_Format(-1, 32, signed=True))

    Log.report(Log.Info, "building polynomial approximation for sine")
    # sine polynomial approximation
    sin_poly_degree = 2
    # the degree is an integer: it used to be rendered with "%e"
    Log.report(Log.Info, "sine poly degree: %d" % sin_poly_degree)
    Log.report(Log.Verbose, "sine polynomial approximation")
    sin_poly_object, sin_approx_error = Polynomial.build_from_approximation_with_error(
        sin(sollya.x) / sollya.x, [0, 2],
        [0] + [computation_precision.get_bit_size()] * (sin_poly_degree + 1),
        poly_interval, sollya.absolute,
        error_function=error_function)
    sin_coeff_list = sin_poly_object.get_ordered_coeff_list()
    coeff_S2 = Constant(
        sin_coeff_list[1][1],
        precision=ML_Custom_FixedPoint_Format(-1, 32, signed=True))

    # scheme selection between sine and cosine
    if self.cos_output:
        scheme = self.generate_cos_scheme(computation_precision, tabulated_cos,
                                          tabulated_sin, coeff_S2, coeff_C2,
                                          red_vx_lo)
    else:
        scheme = self.generate_sin_scheme(computation_precision, tabulated_cos,
                                          tabulated_sin, coeff_S2, coeff_C2,
                                          red_vx_lo)

    result = Conversion(scheme, precision=output_precision)

    Log.report(
        Log.Verbose, "result operation tree :\n %s " % result.get_str(
            display_precision=True, depth=None, memoization_map={}))
    scheme = Statement(Return(result))

    return scheme
def generate_scheme(self):
    """ Interval-evaluation self-check.

    Builds a handful of fixed-point and floating-point nodes, records the
    interval each one is expected to have, then asserts that
    evaluate_range() returns exactly that interval for every checked node.
    Returns the implementation wrapped in a one-element list. """
    int_size = 3
    frac_size = self.width - int_size
    input_precision = fixed_point(int_size, frac_size)
    output_precision = fixed_point(int_size, frac_size)

    # node -> interval the range evaluation is expected to produce
    expected_interval = {}

    def declare_input(name, interval):
        """ declare an input signal bounded by <interval> and register it """
        signal = self.implementation.add_input_signal(name, input_precision)
        signal.set_interval(interval)
        expected_interval[signal] = interval
        return signal

    # declaring main input variables
    x_interval = Interval(-10.3,10.7)
    var_x = declare_input("x", x_interval)
    y_interval = Interval(-17.9,17.2)
    var_y = declare_input("y", y_interval)
    z_interval = Interval(-7.3,7.7)
    var_z = declare_input("z", z_interval)

    cst = Constant(42.5, tag = "cst")
    expected_interval[cst] = Interval(42.5)

    conv_ceil = Ceil(var_x, tag = "ceil")
    expected_interval[conv_ceil] = sollya.ceil(x_interval)

    conv_floor = Floor(var_y, tag = "floor")
    expected_interval[conv_floor] = sollya.floor(y_interval)

    mult = var_z * var_x
    mult.set_tag("mult")
    mult_interval = z_interval * x_interval
    expected_interval[mult] = mult_interval

    large_add = (var_x + var_y) - mult
    large_add.set_attributes(tag = "large_add")
    large_add_interval = (x_interval + y_interval) - mult_interval
    expected_interval[large_add] = large_add_interval

    var_x_lzc = CountLeadingZeros(var_x, tag="var_x_lzc")
    expected_interval[var_x_lzc] = Interval(0, input_precision.get_bit_size())

    # clamp of large_add between 0 and 13
    reduced_result = Max(0, Min(large_add, 13))
    reduced_result.set_tag("reduced_result")
    clamped_above = interval_min(large_add_interval, Interval(13))
    reduced_result_interval = interval_max(Interval(0), clamped_above)
    expected_interval[reduced_result] = reduced_result_interval

    select_result = Select(
        var_x > var_y,
        reduced_result,
        var_z,
        tag = "select_result"
    )
    select_interval = interval_union(reduced_result_interval, z_interval)
    expected_interval[select_result] = select_interval

    # floating-point operation on mantissa and exponents
    fp_x_range = Interval(-0.01, 100)
    unbound_fp_var = Variable("fp_x", precision=ML_Binary32, interval=fp_x_range)
    mant_fp_x = MantissaExtraction(unbound_fp_var, tag="mant_fp_x", precision=ML_Binary32)
    exp_fp_x = ExponentExtraction(unbound_fp_var, tag="exp_fp_x", precision=ML_Int32)
    ins_exp_fp_x = ExponentInsertion(exp_fp_x, tag="ins_exp_fp_x", precision=ML_Binary32)

    expected_interval[unbound_fp_var] = fp_x_range
    fp_magnitude = abs(fp_x_range)
    exp_interval = Interval(
        sollya.floor(sollya.log2(sollya.inf(fp_magnitude))),
        sollya.floor(sollya.log2(sollya.sup(fp_magnitude)))
    )
    expected_interval[exp_fp_x] = exp_interval
    expected_interval[mant_fp_x] = Interval(1, 2)
    expected_interval[ins_exp_fp_x] = Interval(
        S2**sollya.inf(exp_interval),
        S2**sollya.sup(exp_interval)
    )

    # checking interval evaluation
    checked_nodes = [
        var_x_lzc, exp_fp_x, unbound_fp_var, mant_fp_x, ins_exp_fp_x,
        cst, var_x, var_y, mult, large_add, reduced_result,
        select_result, conv_ceil, conv_floor,
    ]
    for node in checked_nodes:
        evaluated = evaluate_range(node)
        wanted = expected_interval[node]
        print("{}: {}".format(node.get_tag(), evaluated))
        print("  vs expected {}".format(wanted))
        assert evaluated is not None
        assert evaluated == wanted

    return [self.implementation]
def get_value_exp(value):
    """ return ceil(log2(|value|)), used as the binary exponent of value

        Because the logarithm is rounded up, the result is the smallest
        integer e such that |value| <= 2**e. """
    magnitude = abs(value)
    exponent = sollya.ceil(sollya.log2(magnitude))
    return exponent
def ulp(v, format_):
    """ return a 'unit in last place' value for <v>, taking the precision
        from <format_>.get_precision()

        The value is 2**(e - (p + 1)) with e = ceil(log2(|v|)) and
        p = format_.get_precision(). """
    value_exponent = sollya.ceil(sollya.log2(sollya.abs(v)))
    ulp_exponent = value_exponent - (format_.get_precision() + 1)
    return sollya.S2**ulp_exponent
def generate_scheme(self):
    """Build a two-table (initial value + offset) approximation of
    ``self.function`` over ``self.interval``.

    The input is reduced to an unsigned fixed-point value with
    ``alpha + beta + gamma`` fractional bits, which is split into three
    index fields:

    * ``alpha`` most significant bits select the tabulated slope,
    * ``alpha | beta`` index the table of initial values (``tiv``),
    * ``alpha | gamma`` index the table of offsets (``to``).

    The output signal ``vr_out`` is the sum of both table reads converted
    to ``self.precision``.  The worst absolute error of the tabulated
    approximation against ``self.function`` over all
    ``2**(alpha + beta + gamma)`` input points is printed, as well as the
    total number of table entries.

    The width of ``self.interval`` must be an exact power of two (asserted),
    and the storage format is derived under the assumption that the
    function is monotonic over the interval.

    Returns:
        ``[self.implementation]``
    """
    io_precision = self.precision
    # declaring main input variable
    vx = self.implementation.add_input_signal("x", io_precision)
    # rounding mode input (declared as part of the interface, not read here)
    rnd_mode = self.implementation.add_input_signal(
        "rnd_mode", rnd_mode_format)

    # size of most significant table index (for linear slope tabulation)
    alpha = self.alpha
    # size of medium significant table index (for initial value table index LSB)
    beta = self.beta
    # size of least significant table index (for linear offset tabulation)
    gamma = self.gamma

    guard_bits = self.guard_bits

    vx.set_interval(self.interval)

    range_hi = sollya.sup(self.interval)
    range_lo = sollya.inf(self.interval)
    f_hi = self.function(range_hi)
    f_lo = self.function(range_lo)
    # fixed by format used for reduced_x
    range_size = range_hi - range_lo
    range_size_log2 = int(sollya.log2(range_size))
    assert 2**range_size_log2 == range_size

    print("range_size_log2={}".format(range_size_log2))

    index_size = alpha + beta + gamma
    reduced_x = Conversion(
        BitLogicRightShift(vx - range_lo, range_size_log2),
        precision=fixed_point(0, index_size, signed=False),
        tag="reduced_x", debug=debug_fixed)

    alpha_index = get_fixed_slice(
        reduced_x, 0, alpha - 1,
        align_hi=FixedPointPosition.FromMSBToLSB,
        align_lo=FixedPointPosition.FromMSBToLSB,
        tag="alpha_index", debug=debug_std)
    gamma_index = get_fixed_slice(
        reduced_x, gamma - 1, 0,
        align_hi=FixedPointPosition.FromLSBToLSB,
        align_lo=FixedPointPosition.FromLSBToLSB,
        tag="gamma_index", debug=debug_std)
    beta_index = get_fixed_slice(
        reduced_x, alpha, gamma,
        align_hi=FixedPointPosition.FromMSBToLSB,
        align_lo=FixedPointPosition.FromLSBToLSB,
        tag="beta_index", debug=debug_std)

    # Assuming monotonic function: extrema are reached at the interval bounds
    f_absmax = max(abs(f_hi), abs(f_lo))
    f_absmin = min(abs(f_hi), abs(f_lo))

    f_msb = int(sollya.ceil(sollya.log2(f_absmax))) + 1
    f_lsb = int(sollya.floor(sollya.log2(f_absmin)))
    storage_lsb = f_lsb - io_precision.get_bit_size() - guard_bits

    f_int_size = f_msb
    f_frac_size = -storage_lsb

    storage_format = fixed_point(f_int_size, f_frac_size, signed=False)
    Log.report(Log.Info, "storage_format is {}".format(storage_format))

    # table of initial value index
    tiv_index = Concatenation(alpha_index, beta_index,
                              tag="tiv_index", debug=debug_std)
    # table of offset value index
    to_index = Concatenation(alpha_index, gamma_index,
                             tag="to_index", debug=debug_std)

    tiv_index_size = alpha + beta
    to_index_size = alpha + gamma

    Log.report(Log.Info, "initial table structures")
    table_iv = ML_NewTable(dimensions=[2**tiv_index_size],
                           storage_precision=storage_format, tag="tiv")
    table_offset = ML_NewTable(dimensions=[2**to_index_size],
                               storage_precision=storage_format, tag="to")

    slope_table = [None] * (2**alpha)
    slope_delta = 1.0 / sollya.SollyaObject(2**alpha)
    delta_u = range_size * slope_delta * 2**-15
    Log.report(Log.Info, "computing slope value")
    for i in range(2**alpha):
        # slope is computed at the middle of range_size interval
        slope_x = range_lo + (i + 0.5) * range_size * slope_delta
        # TODO: gross approximation of derivatives (centered finite difference)
        f_xpu = self.function(slope_x + delta_u / 2)
        f_xmu = self.function(slope_x - delta_u / 2)
        slope_table[i] = (f_xpu - f_xmu) / delta_u

    range_rcp_steps = 1.0 / sollya.SollyaObject(2**tiv_index_size)
    Log.report(Log.Info, "computing value for initial-value table")
    for i in range(2**tiv_index_size):
        # the alpha field is made of the upper bits of the tiv index
        slope_index = i >> beta
        iv_x = range_lo + i * range_rcp_steps * range_size
        offset_x = 0.5 * range_rcp_steps * range_size
        # initial value is computed so that the piecewise linear
        # approximation intersects the function at iv_x + offset_x
        iv_y = self.function(iv_x + offset_x) - offset_x * slope_table[slope_index]
        table_iv[i] = storage_format.round_sollya_object(iv_y)

    # determining table of initial value interval
    tiv_values = [table_iv[i] for i in range(2**tiv_index_size)]
    table_iv.set_interval(Interval(min(tiv_values), max(tiv_values)))

    offset_step = range_size / S2**index_size
    for i in range(2**alpha):
        Log.report(Log.Info,
                   "computing offset value for sub-table {}".format(i))
        for j in range(2**gamma):
            to_i = i * 2**gamma + j
            table_offset[to_i] = slope_table[i] * j * offset_step

    # determining table of offset interval (from the unrounded offsets)
    to_values = [table_offset[i] for i in range(2**to_index_size)]
    offset_interval = Interval(min(to_values), max(to_values))
    table_offset.set_interval(offset_interval)

    initial_value = TableLoad(table_iv, tiv_index, precision=storage_format,
                              tag="initial_value", debug=debug_fixed)

    offset_precision = get_fixed_type_from_interval(offset_interval, 16)
    print("offset_precision is {} ({} bits)".format(
        offset_precision, offset_precision.get_bit_size()))
    table_offset.get_precision().storage_precision = offset_precision

    # rounding table value; every entry is rounded, including index 0 which
    # the previous loop bounds (starting at 1) left untouched
    for i in range(2**to_index_size):
        table_offset[i] = offset_precision.round_sollya_object(table_offset[i])

    offset_value = TableLoad(table_offset, to_index,
                             precision=offset_precision,
                             tag="offset_value", debug=debug_fixed)

    Log.report(
        Log.Verbose,
        "initial_value's interval: {}, offset_value's interval: {}".format(
            evaluate_range(initial_value), evaluate_range(offset_value)))

    final_add = initial_value + offset_value
    vr_out = Conversion(final_add, precision=io_precision,
                        tag="vr_out", debug=debug_fixed)

    self.implementation.add_output_signal("vr_out", vr_out)

    # Approximation error evaluation: exhaustive over every reduced input
    approx_error = 0.0
    for i in range(2**alpha):
        for j in range(2**beta):
            tiv_i = i * 2**beta + j
            iv = table_iv[tiv_i]
            for k in range(2**gamma):
                to_i = i * 2**gamma + k
                approx_value = table_offset[to_i] + iv
                table_x = range_lo + range_size * (
                    tiv_i * 2**gamma + k) / S2**index_size
                # compare against the function actually being tabulated
                # (the reference used to be hard-coded to 1 / table_x)
                local_error = abs(self.function(table_x) - approx_value)
                approx_error = max(approx_error, local_error)
    error_log2 = float(sollya.log2(approx_error))
    print("approx_error is {}, error_log2 is {}".format(
        float(approx_error), error_log2))

    # table size
    table_iv_size = 2**tiv_index_size
    table_offset_size = 2**to_index_size
    print("tables' size are {} entries".format(table_iv_size +
                                                table_offset_size))

    return [self.implementation]
def __init__(self,
             precision = ML_Binary32,
             abs_accuracy = S2**-24,
             libm_compliant = True,
             debug_flag = False,
             fuse_fma = True,
             fast_path_extract = True,
             target = GenericProcessor(),
             output_file = "expf.c",
             function_name = "expf"):
    """Generate a C implementation of exp(x) and write it to output_file.

    The scheme handles NaN/infinity inputs, filters early overflow and
    underflow, reduces the argument as r = x - k.log(2) (log(2) being split
    into log2_hi + log2_lo), searches for a polynomial degree whose
    approximation + evaluation error stays below one ulp, and finally
    rebuilds the result with exponent insertion (with dedicated late
    overflow / underflow paths).

    Args:
        precision: floating-point format of the input and output.
        abs_accuracy: not read by this constructor.
        libm_compliant: forwarded to the C code generator.
        debug_flag: enables debug output in the generated code (and adds
            the stdio.h / inttypes.h headers).
        fuse_fma: enables multiply-add fusion in the optimization passes.
        fast_path_extract: enables fast path factorization.
        target: processor description used by every engine and generator.
        output_file: path of the C file written by this constructor.
        function_name: name of the generated C function.

    Side effects:
        sets self.precision, self.function_name and self.result, runs the
        gappa-based error evaluations (gappa_filename "red_arg.g" and
        "gappa_poly.g") and writes output_file.
    """
    # declaring target and instantiating optimization engine
    processor = target
    self.precision = precision
    opt_eng = OptimizationEngine(processor)
    gappacg = GappaCodeGenerator(processor, declare_cst = True, disable_debug = True)

    # declaring CodeFunction and retrieving input variable
    self.function_name = function_name
    exp_implementation = CodeFunction(self.function_name, output_format = self.precision)
    vx = exp_implementation.add_input_variable("x", self.precision)

    Log.report(Log.Info, "\033[33;1m generating implementation scheme \033[0m")

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    test_nan_or_inf = Test(vx, specifier = Test.IsInfOrNaN, likely = False, debug = True, tag = "nan_or_inf")
    test_nan = Test(vx, specifier = Test.IsNaN, debug = True, tag = "is_nan_test")
    test_positive = Comparison(vx, 0, specifier = Comparison.GreaterOrEqual, debug = True, tag = "inf_sign")

    test_signaling_nan = Test(vx, specifier = Test.IsSignalingNaN, debug = True, tag = "is_signaling_nan")
    return_snan = Statement(ExpRaiseReturn(ML_FPE_Invalid, return_value = FP_QNaN(self.precision)))

    # return in case of infinity input
    infty_return = Statement(ConditionBlock(test_positive, Return(FP_PlusInfty(self.precision)), Return(FP_PlusZero(self.precision))))
    # return in case of specific value input (NaN or inf)
    specific_return = ConditionBlock(test_nan, ConditionBlock(test_signaling_nan, return_snan, Return(FP_QNaN(self.precision))), infty_return)
    # return in case of standard (non-special) input

    # exclusion of early overflow and underflow cases
    precision_emax = self.precision.get_emax()
    precision_max_value = S2 * S2**precision_emax
    exp_overflow_bound = ceil(log(precision_max_value))
    early_overflow_test = Comparison(vx, exp_overflow_bound, likely = False, specifier = Comparison.Greater)
    early_overflow_return = Statement(ClearException(), ExpRaiseReturn(ML_FPE_Inexact, ML_FPE_Overflow, return_value = FP_PlusInfty(self.precision)))

    precision_emin = self.precision.get_emin_subnormal()
    precision_min_value = S2 ** precision_emin
    exp_underflow_bound = floor(log(precision_min_value))

    early_underflow_test = Comparison(vx, exp_underflow_bound, likely = False, specifier = Comparison.Less)
    early_underflow_return = Statement(ClearException(), ExpRaiseReturn(ML_FPE_Inexact, ML_FPE_Underflow, return_value = FP_PlusZero(self.precision)))

    sollya_prec_map = {ML_Binary32: sollya.binary32, ML_Binary64: sollya.binary64}

    # constant computation
    invlog2 = round(1/log(2), sollya_prec_map[self.precision], RN)

    interval_vx = Interval(exp_underflow_bound, exp_overflow_bound)
    interval_fk = interval_vx * invlog2
    interval_k = Interval(floor(inf(interval_fk)), ceil(sup(interval_fk)))

    # log2_hi keeps few enough bits for k * log2_hi to be exact
    log2_hi_precision = self.precision.get_field_size() - (ceil(log2(sup(abs(interval_k)))) + 2)
    # the value used to sit outside of the Log.report call and was never logged
    Log.report(Log.Info, "log2_hi_precision: %s" % log2_hi_precision)
    invlog2_cst = Constant(invlog2, precision = self.precision)
    log2_hi = round(log(2), log2_hi_precision, sollya.RN)
    log2_lo = round(log(2) - log2_hi, sollya_prec_map[self.precision], sollya.RN)

    # argument reduction
    unround_k = vx * invlog2
    unround_k.set_attributes(tag = "unround_k", debug = ML_Debug(display_format = "%f"))
    k = NearestInteger(unround_k, precision = self.precision, debug = ML_Debug(display_format = "%f"))
    ik = NearestInteger(unround_k, precision = ML_Int32, debug = ML_Debug(display_format = "%d"), tag = "ik")
    ik.set_tag("ik")
    k.set_tag("k")
    exact_pre_mul = (k * log2_hi)
    exact_pre_mul.set_attributes(exact= True)
    exact_hi_part = vx - exact_pre_mul
    exact_hi_part.set_attributes(exact = True)
    r = exact_hi_part - k * log2_lo
    r.set_tag("r")
    r.set_attributes(debug = ML_Debug(display_format = "%f"))

    opt_r = opt_eng.optimization_process(r, self.precision, copy = True, fuse_fma = fuse_fma)

    tag_map = {}
    opt_eng.register_nodes_by_tag(opt_r, tag_map)

    cg_eval_error_copy_map = {
        vx: Variable("x", precision = self.precision, interval = interval_vx),
        tag_map["k"]: Variable("k", interval = interval_k, precision = self.precision)
    }
    eval_error = gappacg.get_eval_error_v2(opt_eng, opt_r, cg_eval_error_copy_map, gappa_filename = "red_arg.g")
    Log.report(Log.Info, "eval error: %s" % eval_error)

    print(r.get_str(depth = None, display_precision = True, display_attribute = True))
    print(opt_r.get_str(depth = None, display_precision = True, display_attribute = True))

    approx_interval = Interval(-log(2)/2, log(2)/2)

    local_ulp = sup(ulp(exp(approx_interval), self.precision))
    print("ulp:  {}".format(local_ulp))
    error_goal = local_ulp
    error_goal_approx = S2**-1 * error_goal

    Log.report(Log.Info, "\033[33;1m building mathematical polynomial \033[0m\n")
    poly_degree = sup(guessdegree(exp(x), approx_interval, error_goal_approx))
    init_poly_degree = poly_degree

    # NOTE(review): a bare `return` used to sit here.  It made the degree
    # search, the scheme assembly and the code generation below unreachable
    # (self.result was never set and output_file never written); it has been
    # removed as a debugging leftover -- confirm against the intended flow.

    # degree search: raise the degree until the total (approximation +
    # evaluation) relative error is below one ulp
    while True:
        Log.report(Log.Info, "attempting poly degree: %d" % poly_degree)
        poly_object, poly_approx_error = Polynomial.build_from_approximation_with_error(exp(x), poly_degree, [self.precision]*(poly_degree+1), approx_interval, absolute)

        Log.report(Log.Info, "poly approx error: %s" % poly_approx_error)

        Log.report(Log.Info, "\033[33;1m generating polynomial evaluation scheme \033[0m")
        poly = PolynomialSchemeEvaluator.generate_horner_scheme(poly_object, r, unified_precision = self.precision)
        poly.set_tag("poly")

        # optimizing poly before evaluation error computation
        opt_poly = opt_eng.optimization_process(poly, self.precision)

        # evaluating error of the polynomial approximation
        r_gappa_var = Variable("r", precision = self.precision, interval = approx_interval)
        poly_error_copy_map = {
            r.get_handle().get_node(): r_gappa_var
        }
        gappacg = GappaCodeGenerator(target, declare_cst = False, disable_debug = True)
        poly_eval_error = gappacg.get_eval_error_v2(opt_eng, poly.get_handle().get_node(), poly_error_copy_map, gappa_filename = "gappa_poly.g")
        Log.report(Log.Info, "poly evaluation error: %s" % poly_eval_error)

        global_poly_error = poly_eval_error + poly_approx_error
        global_rel_poly_error = global_poly_error / exp(approx_interval)
        print("global_poly_error:  {} {}".format(global_poly_error, global_rel_poly_error))
        flag = local_ulp > sup(abs(global_rel_poly_error))
        print("test:  {}".format(flag))
        if flag:
            break
        if poly_degree > init_poly_degree + 5:
            Log.report(Log.Error, "poly degree search did not converge")
        poly_degree += 1

    late_overflow_test = Comparison(ik, self.precision.get_emax(), specifier = Comparison.Greater, likely = False, debug = True, tag = "late_overflow_test")
    # integer division: the offset is used as an exponent value
    overflow_exp_offset = (self.precision.get_emax() - self.precision.get_field_size() // 2)
    diff_k = ik - overflow_exp_offset
    diff_k.set_attributes(debug = ML_Debug(display_format = "%d"), tag = "diff_k")
    late_overflow_result = (ExponentInsertion(diff_k) * poly) * ExponentInsertion(overflow_exp_offset)
    late_overflow_result.set_attributes(silent = False, tag = "late_overflow_result", debug = debugf)
    late_overflow_return = ConditionBlock(Test(late_overflow_result, specifier = Test.IsInfty, likely = False), ExpRaiseReturn(ML_FPE_Overflow, return_value = FP_PlusInfty(self.precision)), Return(late_overflow_result))

    late_underflow_test = Comparison(k, self.precision.get_emin_normal(), specifier = Comparison.LessOrEqual, likely = False)
    underflow_exp_offset = 2 * self.precision.get_field_size()
    late_underflow_result = (ExponentInsertion(ik + underflow_exp_offset) * poly) * ExponentInsertion(-underflow_exp_offset)
    late_underflow_result.set_attributes(debug = ML_Debug(display_format = "%e"), tag = "late_underflow_result", silent = False)
    test_subnormal = Test(late_underflow_result, specifier = Test.IsSubnormal)
    late_underflow_return = Statement(ConditionBlock(test_subnormal, ExpRaiseReturn(ML_FPE_Underflow, return_value = late_underflow_result)), Return(late_underflow_result))

    std_result = poly * ExponentInsertion(ik, tag = "exp_ik", debug = debug_lftolx)
    std_result.set_attributes(tag = "std_result", debug = debug_lftolx)
    result_scheme = ConditionBlock(late_overflow_test, late_overflow_return, ConditionBlock(late_underflow_test, late_underflow_return, Return(std_result)))
    std_return = ConditionBlock(early_overflow_test, early_overflow_return, ConditionBlock(early_underflow_test, early_underflow_return, result_scheme))

    # main scheme
    Log.report(Log.Info, "\033[33;1m MDL scheme \033[0m")
    scheme = ConditionBlock(test_nan_or_inf, Statement(ClearException(), specific_return), std_return)

    # fusing FMA
    if fuse_fma:
        Log.report(Log.Info, "\033[33;1m MDL fusing FMA \033[0m")
        scheme = opt_eng.fuse_multiply_add(scheme, silence = True)

    Log.report(Log.Info, "\033[33;1m MDL abstract scheme \033[0m")
    opt_eng.instantiate_abstract_precision(scheme, None)

    Log.report(Log.Info, "\033[33;1m MDL instantiated scheme \033[0m")
    opt_eng.instantiate_precision(scheme, default_precision = self.precision)

    Log.report(Log.Info, "\033[33;1m subexpression sharing \033[0m")
    opt_eng.subexpression_sharing(scheme)

    Log.report(Log.Info, "\033[33;1m silencing operation \033[0m")
    opt_eng.silence_fp_operations(scheme)

    # registering scheme as function implementation
    exp_implementation.set_scheme(scheme)

    # check processor support
    Log.report(Log.Info, "\033[33;1m checking processor support \033[0m")
    opt_eng.check_processor_support(scheme)

    # factorizing fast path
    if fast_path_extract:
        Log.report(Log.Info, "\033[33;1m factorizing fast path\033[0m")
        opt_eng.factorize_fast_path(scheme)

    Log.report(Log.Info, "\033[33;1m generating source code \033[0m")
    cg = CCodeGenerator(processor, declare_cst = False, disable_debug = not debug_flag, libm_compliant = libm_compliant)
    self.result = exp_implementation.get_definition(cg, C_Code, static_cst = True)
    self.result.add_header("support_lib/ml_special_values.h")
    self.result.add_header_comment("polynomial degree for exp(x): %d" % poly_degree)
    self.result.add_header_comment("sollya polynomial for exp(x): %s" % poly_object.get_sollya_object())
    if debug_flag:
        self.result.add_header("stdio.h")
        self.result.add_header("inttypes.h")
    # the context manager closes the file even if code emission fails
    with open(output_file, "w") as output_stream:
        output_stream.write(self.result.get(cg))
def generate_scheme(self):
    """Build the (work in progress) two-level table-based logarithm scheme.

    Steps performed:

    1. search argument reduction parameters under a fixed memory limit
       (self.generate_argument_reduction),
    2. fill two pairs of tables (approximate inverse and logarithm) sized
       from these parameters,
    3. extract the input mantissa as a fixed-point fraction and reduce it
       twice by multiplying with the tabulated inverses,
    4. evaluate a polynomial on the reduced argument.

    Returns:
        a ConditionBlock which runs the special-case handling when the input
        is not a normal positive number, and returns the polynomial value
        otherwise.  The exponent, log2_hi/log2_lo and the logarithm tables
        are built but not yet used by the returned scheme.
    """
    memory_limit = 2500

    # local overloading of RaiseReturn operation; input_var is only bound
    # further down, before this closure is first called
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = input_var
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    ### Constants computations ###
    # log(2) rounded to 52 fractional bits: scale up, round, scale back
    # (the two exponents used to be swapped, which always produced 0)
    v_log2_hi = nearestint(log(2) * S2**52) * S2**-52
    v_log2_lo = round(log(2) - v_log2_hi, 64+53, sollya.RN)
    log2_hi = Constant(v_log2_hi, precision = self.precision, tag = "log2_hi")
    log2_lo = Constant(v_log2_lo, precision = self.precision, tag = "log2_lo")

    print("\n\033[1mSearch parameters for the argument reduction:\033[0m (this can take a while)")
    arg_reduc = self.generate_argument_reduction(memory_limit)

    print("\n\033[1mArgument reduction found:\033[0m [({},{}),({},{})] -> polynomials of degree {},{}, using {} bytes of memory".format(arg_reduc['size1'],arg_reduc['prec1'],arg_reduc['size2'],arg_reduc['prec2'],arg_reduc['degree_poly1'],arg_reduc['degree_poly2'],arg_reduc['sizeof_tables']))

    print("\n\033[1mGenerate the first logarithm table:\033[0m containing {} elements, using {} bytes of memory".format(arg_reduc['length_table1'], arg_reduc['sizeof_table1']))
    inv_table_1 = ML_Table(dimensions = [arg_reduc['length_table1']],
                           storage_precision = ML_Custom_FixedPoint_Format(1, arg_reduc['prec1'], False),
                           tag = self.uniquify_name("inv_table_1"))
    log_table_1 = ML_Table(dimensions = [arg_reduc['length_table1']],
                           storage_precision = ML_Custom_FixedPoint_Format(11, 128-11, False),
                           tag = self.uniquify_name("log_table_1"))
    # NOTE(review): the bound length - 1 leaves the last entry unset in both
    # table loops; kept as is -- confirm how length_table* is defined
    for i in range(0, arg_reduc['length_table1']-1):
        # x1 = 1 + i . 2^-size1 (was written `i/S2*size1`, unlike table 2)
        x1 = 1 + i/S2**arg_reduc['size1']
        # 1/x1 rounded up to prec1 fractional bits, so that it fits the
        # (1, prec1) storage format (the scale-back exponent was positive)
        inv_x1 = ceil(S2**arg_reduc['prec1']/x1)*S2**-arg_reduc['prec1']
        log_x1 = floor(log(x1) * S2**(128-11))*S2**(11-128)
        inv_table_1[i] = inv_x1
        log_table_1[i] = log_x1

    print("\n\033[1mGenerate the second logarithm table:\033[0m containing {} elements, using {} bytes of memory".format(arg_reduc['length_table2'], arg_reduc['sizeof_table2']))
    inv_table_2 = ML_Table(dimensions = [arg_reduc['length_table2']],
                           storage_precision = ML_Custom_FixedPoint_Format(1, arg_reduc['prec2'], False),
                           tag = self.uniquify_name("inv_table_2"))
    log_table_2 = ML_Table(dimensions = [arg_reduc['length_table2']],
                           storage_precision = ML_Custom_FixedPoint_Format(11, 128-11, False),
                           tag = self.uniquify_name("log_table_2"))
    for i in range(0, arg_reduc['length_table2']-1):
        y1 = 1 + i/S2**arg_reduc['size2']
        # inverse of y1 (the stale x1 of the first loop used to be divided)
        inv_y1 = ceil(S2**arg_reduc['prec2']/y1) * S2**-arg_reduc['prec2']
        # NOTE(review): table 1 stores log(x1) whereas this stores
        # log(inv_y1), which is <= 0 and does not fit the unsigned storage
        # format; left unchanged -- confirm which quantity is intended
        log_y1 = floor(log(inv_y1) * S2**(128-11))*S2**(11-128)
        inv_table_2[i] = inv_y1
        log_table_2[i] = log_y1

    ### Evaluation Scheme ###

    print("\n\033[1mGenerate the evaluation scheme:\033[0m")
    input_var = self.implementation.add_input_variable("input_var", self.precision)
    ve = ExponentExtraction(input_var, tag = "x_exponent", debug = debugd)
    vx = MantissaExtraction(input_var, tag = "x_mantissa", precision = ML_Custom_FixedPoint_Format(0,52,False), debug = debug_lftolx)

    print("filtering and handling special cases")
    test_is_special_cases = LogicalNot(Test(input_var, specifier = Test.IsIEEENormalPositive, likely = True, debug = debugd, tag = "is_special_cases"))
    handling_special_cases = Statement(
        ConditionBlock(
            Test(input_var, specifier = Test.IsSignalingNaN, debug = True),
            ExpRaiseReturn(ML_FPE_Invalid, return_value = FP_QNaN(self.precision))
        ),
        ConditionBlock(
            Test(input_var, specifier = Test.IsNaN, debug = True),
            Return(input_var)
        )
        # TODO: add tests for x == 0 (raise DivideByZero, return -Inf), x < 0 (raise InvalidOperation, return qNaN)
        # all that remains is x is a subnormal positive
        #Statement(
        #  ReferenceAssign(Dereference(ve), Subtraction(ve, Subtraction(CountLeadingZeros(input_var, tag = 'subnormal_clz', precision = ve.get_precision()), Constant(12, precision = ve.get_precision())))),
        #  ReferenceAssign(Dereference(vx), BitLogicLeftShift(vx, Addition(CountLeadingZeros(input_var, tag = 'subnormal_clz', precision = ve.get_precision()), Constant(1, precision = ve.get_precision()))))
        #)
    )

    print("doing the argument reduction")
    v_dx = vx
    # first level: the size1 leading fraction bits index the first table
    v_x1 = Conversion(v_dx, tag = 'x1', precision = ML_Custom_FixedPoint_Format(0,arg_reduc['size1'],False), rounding_mode = ML_RoundTowardMinusInfty)
    v_index_x = TypeCast(v_x1, tag = 'index_x', precision = ML_Int32)
    v_inv_x = TableLoad(inv_table_1, v_index_x, tag = 'inv_x')
    v_x = Addition(v_dx, 1, tag = 'x', precision = ML_Custom_FixedPoint_Format(1,52,False))
    v_dy = Multiplication(v_x, v_inv_x, tag = 'dy', precision = ML_Custom_FixedPoint_Format(0,52+arg_reduc['prec1'],False))
    # second level: same process on the first reduced argument
    v_y1 = Conversion(v_dy, tag = 'y1', precision = ML_Custom_FixedPoint_Format(0,arg_reduc['size2'],False), rounding_mode = ML_RoundTowardMinusInfty)
    v_index_y = TypeCast(v_y1, tag = 'index_y', precision = ML_Int32)
    v_inv_y = TableLoad(inv_table_2, v_index_y, tag = 'inv_y')
    # NOTE(review): v_dy carries 52 + prec1 fraction bits while v_y is
    # declared with 52 + prec2 -- confirm
    v_y = Addition(v_dy, 1, tag = 'y', precision = ML_Custom_FixedPoint_Format(1,52+arg_reduc['prec2'],False))
    # note that we limit the number of bits used to represent dz to 64.
    # we proved during the arg reduction that we can do that (sup(out_interval) < 2^(64-52-prec1-prec2))
    v_dz = Multiplication(v_y, v_inv_y, tag = 'z', precision = ML_Custom_FixedPoint_Format(64-52-arg_reduc['prec1']-arg_reduc['prec2'],52+arg_reduc['prec1']+arg_reduc['prec2'],False))

    print("doing the first polynomial evaluation")
    global_poly1_object = Polynomial.build_from_approximation(log(1+sollya.x)/sollya.x, arg_reduc['degree_poly1']-1, [64] * (arg_reduc['degree_poly1']), arg_reduc['out_interval'], fixed, sollya.absolute)
    poly1_object = global_poly1_object.sub_poly(start_index = 1)
    print(global_poly1_object)
    print(poly1_object)
    poly1 = PolynomialSchemeEvaluator.generate_horner_scheme(poly1_object, v_dz, unified_precision = v_dz.get_precision())
    return ConditionBlock(test_is_special_cases, handling_special_cases, Return(poly1))

    #approx_interval = Interval(0, 27021597764222975*S2**-61)

    #poly_degree = 1+sup(guessdegree(log(1+x)/x, approx_interval, S2**-(self.precision.get_field_size())))
    #global_poly_object = Polynomial.build_from_approximation(log(1+x)/x, poly_degree, [1] + [self.precision]*(poly_degree), approx_interval, sollya.absolute)
    #poly_object = global_poly_object.sub_poly(start_index = 1)

    #_poly = PolynomialSchemeEvaluator.generate_horner_scheme(poly_object, _red_vx, unified_precision = self.precision)
    #_poly.set_attributes(tag = "poly", debug = debug_lftolx)
def generate_scheme(self):
    """Build the fixed-accumulator multiply-accumulate (MPFMA) datapath.

    The mantissas of inputs ``x`` and ``y`` are multiplied, the product is
    right-shifted to its position inside a fixed-width accumulator and then
    added to the ``acc`` input.

    Signals declared on ``self.implementation``:
      * inputs ``reset``, ``x``, ``y`` (``HdlVirtualFormat(self.precision)``
        for ``x``/``y``) and ``acc`` (``acc_width``-bit vector);
      * input ``sign_acc`` only when ``self.sign_magnitude`` is set,
        otherwise the accumulator sign is extracted from ``acc``;
      * outputs ``vr_sign`` and ``vr_acc`` in sign-magnitude mode, only
        ``vr_acc`` otherwise.

    When ``self.pipelined`` is set, a new pipeline stage is started after
    the ``x``/``y`` inputs, after the product shift and before the outputs.

    Returns:
        A single-element list containing ``self.implementation``.
    """
    Log.report(
        Log.Info,
        "generating fixed MPFMA with {ed} extra digit(s) and sign-magnitude accumulator: {sm}"
        .format(ed=self.extra_digit, sm=self.sign_magnitude))

    # helper building a constant of the support format of <prec> from the
    # integer coding of <value>
    # NOTE(review): not referenced anywhere else in this method
    def get_virtual_cst(prec, value, language):
        return prec.get_support_format().get_cst(
            prec.get_base_format().get_integer_coding(value, language))

    # NOTE(review): the following doxygen fragment looks orphaned (it
    # describes a conversion helper that is not defined here):
    ## convert @p value from an input floating-point precision
    #  @p in_precision to an output support format @p out_precision
    io_precision = HdlVirtualFormat(self.precision)
    # declaring standard clock and reset input signal
    #clk = self.implementation.add_input_signal("clk", ML_StdLogic)
    # reset = self.implementation.add_input_signal("reset", ML_StdLogic)
    # declaring main input variable

    # maximum weight for a mantissa product digit
    max_prod_exp = self.precision.get_emax() * 2 + 1
    # minimum weight for a mantissa product digit
    min_prod_exp = self.precision.get_emin_subnormal() * 2

    ## Most and least significant digit index for the
    #  accumulator
    acc_msb_index = max_prod_exp + self.extra_digit
    acc_lsb_index = min_prod_exp

    acc_width = acc_msb_index - min_prod_exp + 1
    # precision of the accumulator
    acc_prec = ML_StdLogicVectorFormat(acc_width)

    # the reset input is declared on the entity but not used by the datapath
    # built below
    reset = self.implementation.add_input_signal("reset", ML_StdLogic)

    vx = self.implementation.add_input_signal("x", io_precision)
    vy = self.implementation.add_input_signal("y", io_precision)

    # Inserting post-input pipeline stage
    # (acc and sign_acc are declared after this stage boundary)
    if self.pipelined:
        self.implementation.start_new_stage()

    acc = self.implementation.add_input_signal("acc", acc_prec)
    if self.sign_magnitude:
        # the accumulator is in sign-magnitude representation:
        # its sign is provided as a separate input
        sign_acc = self.implementation.add_input_signal(
            "sign_acc", ML_StdLogic)
    else:
        # otherwise the sign is extracted from the accumulator itself
        sign_acc = CopySign(acc,
                            precision=ML_StdLogic,
                            tag="sign_acc",
                            debug=debug_std)

    vx_precision = self.precision
    vy_precision = self.precision
    result_precision = acc_prec

    # precision for first operand vx which is to be statically
    # positioned
    p = vx_precision.get_mantissa_size()
    # precision for second operand vy which is to be dynamically shifted
    q = vy_precision.get_mantissa_size()

    # vx must be aligned with vy
    # the largest shift amount (in absolute value) is precision + 2
    # (1 guard bit and 1 rounding bit)
    exp_vx_precision = ML_StdLogicVectorFormat(
        vx_precision.get_exponent_size())
    exp_vy_precision = ML_StdLogicVectorFormat(
        vy_precision.get_exponent_size())

    mant_vx_precision = ML_StdLogicVectorFormat(p - 1)
    mant_vy_precision = ML_StdLogicVectorFormat(q - 1)

    mant_vx = MantissaExtraction(vx, precision=mant_vx_precision)
    mant_vy = MantissaExtraction(vy, precision=mant_vy_precision)

    exp_vx = ExponentExtraction(vx,
                                precision=exp_vx_precision,
                                tag="exp_vx",
                                debug=debug_dec)
    exp_vy = ExponentExtraction(vy,
                                precision=exp_vy_precision,
                                tag="exp_vy",
                                debug=debug_dec)

    # Maximum number of leading zero for normalized <vx> mantissa
    L_x = 0
    # Maximum number of leading zero for normalized <vy> mantissa
    L_y = 0
    # Maximum number of leading zero for the product of <x>.<y>
    # mantissa.
    L_xy = L_x + L_y + 1

    sign_vx = CopySign(vx, precision=ML_StdLogic)
    sign_vy = CopySign(vy, precision=ML_StdLogic)

    # sign of the product x.y
    sign_xy = BitLogicXor(sign_vx,
                          sign_vy,
                          precision=ML_StdLogic,
                          tag="sign_xy",
                          debug=ML_Debug(display_format="-radix 2"))
    # determining if the operation is an addition (effective_op = '0')
    # or a subtraction (effective_op = '1')
    effective_op = BitLogicXor(sign_xy,
                               sign_acc,
                               precision=ML_StdLogic,
                               tag="effective_op",
                               debug=ML_Debug(display_format="-radix 2"))

    exp_vx_bias = vx_precision.get_bias()
    exp_vy_bias = vy_precision.get_bias()

    # <acc> is statically positioned in the datapath,
    # it may even constitute the whole datapath
    #
    # the product is shifted with respect to the fixed accumulator

    exp_bias = (exp_vx_bias + exp_vy_bias)

    # because of the mantissa range [1, 2[, the product exponent
    # is located one bit to the right (lower) of the product MSB
    prod_exp_offset = 1

    # Determine a working precision to accommodate exponent difference
    # FIXME: check interval and exponent operations size
    exp_precision_ext_size = max(
        vx_precision.get_exponent_size(),
        vy_precision.get_exponent_size(),
        abs(ceil(log2(abs(acc_msb_index)))),
        abs(ceil(log2(abs(acc_lsb_index)))),
        abs(ceil(log2(abs(exp_bias + prod_exp_offset)))),
    ) + 2
    Log.report(Log.Info,
               "exp_precision_ext_size={}".format(exp_precision_ext_size))
    exp_precision_ext = ML_StdLogicVectorFormat(exp_precision_ext_size)

    # static accumulator exponent
    exp_acc = Constant(acc_msb_index,
                       precision=exp_precision_ext,
                       tag="exp_acc",
                       debug=debug_cst_dec)

    # NOTE(review): the original comment here described an x/y alignment
    # ("Y is first aligned offset = max(o+L_y,q) + 2 bits to the left of x
    # and then shifted right by exp_diff = exp_x - exp_y + offset"); it does
    # not match the expression below and looks inherited from another
    # operator -- confirm before relying on it.
    #
    # What is actually built:
    #   exp_diff = exp_acc - ((exp_vy + exp_vx) + (exp_bias + prod_exp_offset))
    # with both extracted exponents zero-extended to exp_precision_ext_size
    exp_diff = Subtraction(
        exp_acc,
        Addition(Addition(zext(
            exp_vy,
            exp_precision_ext_size - vy_precision.get_exponent_size()),
                          zext(
                              exp_vx, exp_precision_ext_size -
                              vx_precision.get_exponent_size()),
                          precision=exp_precision_ext),
                 Constant(exp_bias + prod_exp_offset,
                          precision=exp_precision_ext,
                          tag="diff_bias",
                          debug=debug_cst_dec),
                 precision=exp_precision_ext,
                 tag="pre_exp_diff",
                 debug=debug_dec),
        precision=exp_precision_ext,
        tag="exp_diff",
        debug=debug_dec)
    # signed view of exp_diff, used for the range comparisons below
    signed_exp_diff = SignCast(exp_diff,
                               specifier=SignCast.Signed,
                               precision=exp_precision_ext)
    datapath_full_width = acc_width
    # the maximum exp diff is the size of the datapath
    # minus the bit size of the product
    max_exp_diff = datapath_full_width - (p + q)
    exp_diff_lt_0 = Comparison(signed_exp_diff,
                               Constant(0, precision=exp_precision_ext),
                               specifier=Comparison.Less,
                               precision=ML_Bool,
                               tag="exp_diff_lt_0",
                               debug=debug_std)
    exp_diff_gt_max_diff = Comparison(signed_exp_diff,
                                      Constant(
                                          max_exp_diff,
                                          precision=exp_precision_ext),
                                      specifier=Comparison.Greater,
                                      precision=ML_Bool)

    # just enough bits to encode a shift amount up to max_exp_diff
    shift_amount_prec = ML_StdLogicVectorFormat(
        int(floor(log2(max_exp_diff)) + 1))

    # shift amount = exp_diff clamped to [0, max_exp_diff]
    mant_shift = Select(exp_diff_lt_0,
                        Constant(0, precision=shift_amount_prec),
                        Select(exp_diff_gt_max_diff,
                               Constant(max_exp_diff,
                                        precision=shift_amount_prec),
                               Truncate(exp_diff,
                                        precision=shift_amount_prec),
                               precision=shift_amount_prec),
                        precision=shift_amount_prec,
                        tag="mant_shift",
                        debug=ML_Debug(display_format="-radix 10"))

    # mantissa product, (p + q)-bit wide
    prod_prec = ML_StdLogicVectorFormat(p + q)
    prod = Multiplication(mant_vx,
                          mant_vy,
                          precision=prod_prec,
                          tag="prod",
                          debug=debug_std)

    # attempt at pipelining the operator
    # self.implementation.start_new_stage()

    # the product is extended to the full datapath width with rzext before
    # being shifted right by mant_shift
    mant_ext_size = datapath_full_width - (p + q)
    shift_prec = ML_StdLogicVectorFormat(datapath_full_width)
    shifted_prod = BitLogicRightShift(rzext(prod, mant_ext_size),
                                      mant_shift,
                                      precision=shift_prec,
                                      tag="shifted_prod",
                                      debug=debug_std)

    ## Inserting a pipeline stage after the product shifting
    if self.pipelined:
        self.implementation.start_new_stage()

    if self.sign_magnitude:
        # the accumulator is in sign-magnitude representation:
        # acc is kept as-is when product and accumulator signs are equal,
        # bitwise-negated otherwise
        acc_negated = Select(Comparison(sign_xy,
                                        sign_acc,
                                        specifier=Comparison.Equal,
                                        precision=ML_Bool),
                             acc,
                             BitLogicNegate(acc, precision=acc_prec),
                             precision=acc_prec)

        # one extra MSB bit is added to the final addition
        # to detect overflows
        add_width = acc_width + 1
        add_prec = ML_StdLogicVectorFormat(add_width)
        # FIXME: implement with a proper compound adder
        # p0 = shifted_prod + acc_negated, p1 = p0 + 1
        mant_add_p0_ext = Addition(zext(shifted_prod, 1),
                                   zext(acc_negated, 1),
                                   precision=add_prec)
        mant_add_p1_ext = Addition(
            mant_add_p0_ext,
            Constant(1, precision=ML_StdLogic),
            precision=add_prec,
            tag="mant_add",
            debug=ML_Debug(display_format=" -radix 2"))
        # discarding carry overflow bit
        mant_add_p0 = SubSignalSelection(mant_add_p0_ext,
                                         0,
                                         acc_width - 1,
                                         precision=acc_prec)
        mant_add_p1 = SubSignalSelection(mant_add_p1_ext,
                                         0,
                                         acc_width - 1,
                                         precision=acc_prec)

        # sign bit extracted from the extended p1 sum
        mant_add_pre_sign = CopySign(mant_add_p1_ext,
                                     precision=ML_StdLogic,
                                     tag="mant_add_pre_sign",
                                     debug=debug_std)
        # result magnitude: p0 when signs are equal; otherwise p1 when
        # mant_add_pre_sign is 1, else the bitwise negation of p0
        mant_add = Select(Comparison(sign_xy,
                                     sign_acc,
                                     specifier=Comparison.Equal,
                                     precision=ML_Bool),
                          mant_add_p0,
                          Select(
                              Comparison(mant_add_pre_sign,
                                         Constant(1, precision=ML_StdLogic),
                                         specifier=Comparison.Equal,
                                         precision=ML_Bool),
                              mant_add_p1,
                              BitLogicNegate(mant_add_p0,
                                             precision=acc_prec),
                              precision=acc_prec,
                          ),
                          precision=acc_prec,
                          tag="mant_add")

        # if both operands had the same sign, then
        # mant_add is necessarily positive and the result
        # sign matches the input sign
        # if both operands had opposite signs, then
        # the result sign matches the product sign
        # if mant_add is positive, else the accumulator sign
        output_sign = Select(
            Comparison(effective_op,
                       Constant(1, precision=ML_StdLogic),
                       specifier=Comparison.Equal,
                       precision=ML_Bool),
            # if the effective op is a subtraction (prod - acc)
            BitLogicXor(sign_acc, mant_add_pre_sign, precision=ML_StdLogic),
            # the effective op is an addition, thus result and
            # acc share sign
            sign_acc,
            precision=ML_StdLogic,
            tag="output_sign")

        if self.pipelined:
            self.implementation.start_new_stage()

        # adding output
        self.implementation.add_output_signal("vr_sign", output_sign)
        self.implementation.add_output_signal("vr_acc", mant_add)

    else:
        # 2s complement encoding of the accumulator,
        # the accumulator is never negated, only the product
        # is negated if negative

        # negate shifted prod when required
        shifted_prod_op = Select(Comparison(sign_xy,
                                            Constant(
                                                1, precision=ML_StdLogic),
                                            specifier=Comparison.Equal,
                                            precision=ML_Bool),
                                 Negation(shifted_prod,
                                          precision=shift_prec),
                                 shifted_prod,
                                 precision=shift_prec)

        # NOTE(review): add_prec is assigned but the addition below uses
        # acc_prec
        add_prec = shift_prec  # ML_StdLogicVectorFormat(datapath_full_width + 1)

        mant_add = Addition(shifted_prod_op,
                            acc,
                            precision=acc_prec,
                            tag="mant_add",
                            debug=ML_Debug(display_format=" -radix 2"))

        if self.pipelined:
            self.implementation.start_new_stage()

        self.implementation.add_output_signal("vr_acc", mant_add)

    return [self.implementation]
def generate_scheme(self):
    """Build the evaluation scheme of the exponential of input ``x``.

    Outline of the generated scheme:
      * special inputs (NaN / infinity) are dispatched first;
      * inputs above ``ceil(log(2 * 2**emax))`` or below
        ``floor(log(2**emin_subnormal))`` return early with
        overflow / underflow results;
      * otherwise ``k = nearestint(x / log(2))`` and the reduced argument
        ``r = (x - k * log2_hi) - k * log2_lo`` are computed, a polynomial
        built from an approximation of ``expm1`` over
        ``[-log(2)/2, log(2)/2]`` is evaluated and the result is rescaled by
        ``2**k``, with dedicated paths for late overflow / underflow.

    The polynomial degree is increased until the relative error (approximation
    error plus gappa evaluation error) is below the goal derived from
    ``self.accuracy``. When gappa is not installed the evaluation errors are
    taken as 0.0 and the search stops after the first attempt.

    Returns:
        The root ``ConditionBlock`` of the scheme.
    """
    # declaring the function input variable
    vx = self.implementation.add_input_variable("x", self.precision)

    Log.set_dump_stdout(True)

    Log.report(Log.Info,
               "\033[33;1m generating implementation scheme \033[0m")
    if self.debug_flag:
        Log.report(Log.Info, "\033[31;1m debug has been enabled \033[0;m")

    # local overloading of RaiseReturn operation: exceptions are only
    # raised in libm-compliant mode, otherwise a plain Return is emitted
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        if self.libm_compliant:
            return RaiseReturn(*args, precision=self.precision, **kwords)
        else:
            return Return(kwords["return_value"], precision=self.precision)

    test_nan_or_inf = Test(vx,
                           specifier=Test.IsInfOrNaN,
                           likely=False,
                           debug=debug_multi,
                           tag="nan_or_inf")
    test_nan = Test(vx,
                    specifier=Test.IsNaN,
                    debug=debug_multi,
                    tag="is_nan_test")
    test_positive = Comparison(vx,
                               0,
                               specifier=Comparison.GreaterOrEqual,
                               debug=debug_multi,
                               tag="inf_sign")

    test_signaling_nan = Test(vx,
                              specifier=Test.IsSignalingNaN,
                              debug=debug_multi,
                              tag="is_signaling_nan")
    return_snan = Statement(
        ExpRaiseReturn(ML_FPE_Invalid,
                       return_value=FP_QNaN(self.precision)))

    # return in case of infinity input: +inf for +inf, +0 for -inf
    infty_return = Statement(
        ConditionBlock(
            test_positive,
            Return(FP_PlusInfty(self.precision), precision=self.precision),
            Return(FP_PlusZero(self.precision), precision=self.precision)))
    # return in case of specific value input (NaN or inf)
    specific_return = ConditionBlock(
        test_nan,
        ConditionBlock(
            test_signaling_nan, return_snan,
            Return(FP_QNaN(self.precision), precision=self.precision)),
        infty_return)

    # exclusion of early overflow and underflow cases
    precision_emax = self.precision.get_emax()
    precision_max_value = S2 * S2**precision_emax
    exp_overflow_bound = sollya.ceil(log(precision_max_value))
    early_overflow_test = Comparison(vx,
                                     exp_overflow_bound,
                                     likely=False,
                                     specifier=Comparison.Greater)
    early_overflow_return = Statement(
        ClearException() if self.libm_compliant else Statement(),
        ExpRaiseReturn(ML_FPE_Inexact,
                       ML_FPE_Overflow,
                       return_value=FP_PlusInfty(self.precision)))

    precision_emin = self.precision.get_emin_subnormal()
    precision_min_value = S2**precision_emin
    exp_underflow_bound = floor(log(precision_min_value))

    early_underflow_test = Comparison(vx,
                                      exp_underflow_bound,
                                      likely=False,
                                      specifier=Comparison.Less)
    early_underflow_return = Statement(
        ClearException() if self.libm_compliant else Statement(),
        ExpRaiseReturn(ML_FPE_Inexact,
                       ML_FPE_Underflow,
                       return_value=FP_PlusZero(self.precision)))

    # constant computation
    invlog2 = self.precision.round_sollya_object(1 / log(2), sollya.RN)

    # range of x once early overflow / underflow have been excluded, and
    # corresponding range of k
    interval_vx = Interval(exp_underflow_bound, exp_overflow_bound)
    interval_fk = interval_vx * invlog2
    interval_k = Interval(floor(inf(interval_fk)),
                          sollya.ceil(sup(interval_fk)))

    # log2_hi is rounded to fewer bits than the format mantissa (field size
    # minus the bit width of max |k|, minus 2); the product k * log2_hi is
    # flagged exact below
    log2_hi_precision = self.precision.get_field_size() - (
        sollya.ceil(log2(sup(abs(interval_k)))) + 2)
    Log.report(Log.Info, "log2_hi_precision: %d" % log2_hi_precision)
    invlog2_cst = Constant(invlog2, precision=self.precision)
    log2_hi = round(log(2), log2_hi_precision, sollya.RN)
    log2_lo = self.precision.round_sollya_object(
        log(2) - log2_hi, sollya.RN)

    # argument reduction
    unround_k = vx * invlog2
    unround_k.set_attributes(tag="unround_k", debug=debug_multi)
    # k is kept both in the floating-point format and in the matching
    # integer format (ik)
    k = NearestInteger(unround_k,
                       precision=self.precision,
                       debug=debug_multi)
    ik = NearestInteger(unround_k,
                        precision=self.precision.get_integer_format(),
                        debug=debug_multi,
                        tag="ik")
    ik.set_tag("ik")
    k.set_tag("k")
    exact_pre_mul = (k * log2_hi)
    exact_pre_mul.set_attributes(exact=True)
    exact_hi_part = vx - exact_pre_mul
    exact_hi_part.set_attributes(exact=True,
                                 tag="exact_hi",
                                 debug=debug_multi,
                                 prevent_optimization=True)
    exact_lo_part = -k * log2_lo
    exact_lo_part.set_attributes(tag="exact_lo",
                                 debug=debug_multi,
                                 prevent_optimization=True)
    r = exact_hi_part + exact_lo_part
    r.set_tag("r")
    r.set_attributes(debug=debug_multi)

    approx_interval = Interval(-log(2) / 2, log(2) / 2)

    # the approximation interval is split in three sub-intervals for the
    # dichotomy-based evaluation error computation
    approx_interval_half = approx_interval / 2
    approx_interval_split = [
        Interval(-log(2) / 2, inf(approx_interval_half)),
        approx_interval_half,
        Interval(sup(approx_interval_half),
                 log(2) / 2)
    ]

    # TODO: should be computed automatically
    exact_hi_interval = approx_interval
    exact_lo_interval = -interval_k * log2_lo

    opt_r = self.optimise_scheme(r, copy={})

    tag_map = {}
    self.opt_engine.register_nodes_by_tag(opt_r, tag_map)

    cg_eval_error_copy_map = {
        vx:
        Variable("x", precision=self.precision, interval=interval_vx),
        tag_map["k"]:
        Variable("k", interval=interval_k, precision=self.precision)
    }

    # evaluation error of the argument reduction (informational only)
    if is_gappa_installed():
        eval_error = self.gappa_engine.get_eval_error_v2(
            self.opt_engine,
            opt_r,
            cg_eval_error_copy_map,
            gappa_filename="red_arg.g")
    else:
        eval_error = 0.0
        Log.report(Log.Warning,
                   "gappa is not installed in this environnement")
    Log.report(Log.Info, "eval error: %s" % eval_error)

    local_ulp = sup(ulp(sollya.exp(approx_interval), self.precision))
    # FIXME refactor error_goal from accuracy
    Log.report(Log.Info, "accuracy: %s" % self.accuracy)
    if isinstance(self.accuracy, ML_Faithful):
        error_goal = local_ulp
    elif isinstance(self.accuracy, ML_CorrectlyRounded):
        error_goal = S2**-1 * local_ulp
    elif isinstance(self.accuracy, ML_DegradedAccuracyAbsolute):
        error_goal = self.accuracy.goal
    elif isinstance(self.accuracy, ML_DegradedAccuracyRelative):
        error_goal = self.accuracy.goal
    else:
        Log.report(Log.Error, "unknown accuracy: %s" % self.accuracy)

    # half of the error budget is given to the polynomial approximation
    error_goal_approx = S2**-1 * error_goal

    Log.report(Log.Info,
               "\033[33;1m building mathematical polynomial \033[0m\n")
    poly_degree = max(
        sup(
            guessdegree(
                expm1(sollya.x) / sollya.x, approx_interval,
                error_goal_approx)) - 1, 2)
    init_poly_degree = poly_degree

    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_estrin_scheme

    # degree search: the degree is incremented until the global relative
    # error is below error_goal
    while True:
        Log.report(Log.Info, "attempting poly degree: %d" % poly_degree)
        precision_list = [1] + [self.precision] * (poly_degree)
        poly_object, poly_approx_error = Polynomial.build_from_approximation_with_error(
            expm1(sollya.x),
            poly_degree,
            precision_list,
            approx_interval,
            sollya.absolute,
            error_function=error_function)
        Log.report(Log.Info, "polynomial: %s " % poly_object)
        # the terms of index 0 and 1 are handled outside of the polynomial
        # scheme (see <poly> below)
        sub_poly = poly_object.sub_poly(start_index=2)
        Log.report(Log.Info, "polynomial: %s " % sub_poly)

        Log.report(Log.Info, "poly approx error: %s" % poly_approx_error)

        Log.report(
            Log.Info,
            "\033[33;1m generating polynomial evaluation scheme \033[0m")
        pre_poly = polynomial_scheme_builder(
            poly_object, r, unified_precision=self.precision)
        pre_poly.set_attributes(tag="pre_poly", debug=debug_multi)

        pre_sub_poly = polynomial_scheme_builder(
            sub_poly, r, unified_precision=self.precision)
        pre_sub_poly.set_attributes(tag="pre_sub_poly", debug=debug_multi)

        # exp(r) ~ 1 + r + sub_poly(r), with r = exact_hi_part + exact_lo_part
        poly = 1 + (exact_hi_part + (exact_lo_part + pre_sub_poly))
        poly.set_tag("poly")

        # optimizing poly before evaluation error computation
        opt_poly = self.optimise_scheme(poly)
        opt_sub_poly = self.optimise_scheme(pre_sub_poly)

        # evaluating error of the polynomial approximation
        r_gappa_var = Variable("r",
                               precision=self.precision,
                               interval=approx_interval)
        exact_hi_gappa_var = Variable("exact_hi",
                                      precision=self.precision,
                                      interval=exact_hi_interval)
        exact_lo_gappa_var = Variable("exact_lo",
                                      precision=self.precision,
                                      interval=exact_lo_interval)
        vx_gappa_var = Variable("x",
                                precision=self.precision,
                                interval=interval_vx)
        k_gappa_var = Variable("k",
                               interval=interval_k,
                               precision=self.precision)

        sub_poly_error_copy_map = {
            exact_hi_part.get_handle().get_node(): exact_hi_gappa_var,
            exact_lo_part.get_handle().get_node(): exact_lo_gappa_var,
        }

        poly_error_copy_map = {
            exact_hi_part.get_handle().get_node(): exact_hi_gappa_var,
            exact_lo_part.get_handle().get_node(): exact_lo_gappa_var,
        }

        if is_gappa_installed():
            sub_poly_eval_error = self.gappa_engine.get_eval_error_v2(
                self.opt_engine,
                opt_sub_poly,
                sub_poly_error_copy_map,
                gappa_filename="%s_gappa_sub_poly.g" % self.function_name)

            # one evaluation-error query per sub-interval of exact_hi
            dichotomy_map = [
                {
                    exact_hi_part.get_handle().get_node():
                    approx_interval_split[0],
                },
                {
                    exact_hi_part.get_handle().get_node():
                    approx_interval_split[1],
                },
                {
                    exact_hi_part.get_handle().get_node():
                    approx_interval_split[2],
                },
            ]
            poly_eval_error_dico = self.gappa_engine.get_eval_error_v3(
                self.opt_engine,
                opt_poly,
                poly_error_copy_map,
                gappa_filename="gappa_poly.g",
                dichotomy=dichotomy_map)

            poly_eval_error = max(
                sup(abs(err)) for err in poly_eval_error_dico)
        else:
            poly_eval_error = 0.0
            sub_poly_eval_error = 0.0
            Log.report(Log.Warning,
                       "gappa is not installed in this environnement")
            Log.report(Log.Info, "stopping autonomous degree research")
            # incrementing polynomial degree to counteract initial decrementation effect
            poly_degree += 1
            break
        Log.report(Log.Info, "poly evaluation error: %s" % poly_eval_error)
        Log.report(Log.Info,
                   "sub poly evaluation error: %s" % sub_poly_eval_error)

        # worst relative error over the three sub-intervals
        global_poly_error = None
        global_rel_poly_error = None

        for case_index in range(3):
            poly_error = poly_approx_error + poly_eval_error_dico[
                case_index]
            rel_poly_error = sup(
                abs(poly_error /
                    sollya.exp(approx_interval_split[case_index])))
            # identity test: avoids dispatching "== None" to the comparison
            # operator of the stored error object
            if global_rel_poly_error is None or rel_poly_error > global_rel_poly_error:
                global_rel_poly_error = rel_poly_error
                global_poly_error = poly_error
        flag = error_goal > global_rel_poly_error

        if flag:
            break
        else:
            poly_degree += 1

    # late overflow: k is above emax, so the scaling by 2**k is split into
    # two exponent insertions
    late_overflow_test = Comparison(ik,
                                    self.precision.get_emax(),
                                    specifier=Comparison.Greater,
                                    likely=False,
                                    debug=debug_multi,
                                    tag="late_overflow_test")
    # floor division: the offset is used as an integer exponent (integer
    # format Constant and ExponentInsertion operand); true division would
    # make it a float under Python 3
    overflow_exp_offset = (self.precision.get_emax() -
                           self.precision.get_field_size() // 2)
    diff_k = Subtraction(
        ik,
        Constant(overflow_exp_offset,
                 precision=self.precision.get_integer_format()),
        precision=self.precision.get_integer_format(),
        debug=debug_multi,
        tag="diff_k",
    )
    late_overflow_result = (ExponentInsertion(
        diff_k, precision=self.precision) * poly) * ExponentInsertion(
            overflow_exp_offset, precision=self.precision)
    late_overflow_result.set_attributes(silent=False,
                                        tag="late_overflow_result",
                                        debug=debug_multi,
                                        precision=self.precision)
    late_overflow_return = ConditionBlock(
        Test(late_overflow_result, specifier=Test.IsInfty, likely=False),
        ExpRaiseReturn(ML_FPE_Overflow,
                       return_value=FP_PlusInfty(self.precision)),
        Return(late_overflow_result, precision=self.precision))

    # late underflow: k is at or below emin_normal, the exponent is first
    # offset upward and the result is then scaled back down
    late_underflow_test = Comparison(k,
                                     self.precision.get_emin_normal(),
                                     specifier=Comparison.LessOrEqual,
                                     likely=False)
    underflow_exp_offset = 2 * self.precision.get_field_size()
    corrected_exp = Addition(
        ik,
        Constant(underflow_exp_offset,
                 precision=self.precision.get_integer_format()),
        precision=self.precision.get_integer_format(),
        tag="corrected_exp")
    late_underflow_result = (
        ExponentInsertion(corrected_exp, precision=self.precision) *
        poly) * ExponentInsertion(-underflow_exp_offset,
                                  precision=self.precision)
    late_underflow_result.set_attributes(debug=debug_multi,
                                         tag="late_underflow_result",
                                         silent=False)
    test_subnormal = Test(late_underflow_result,
                          specifier=Test.IsSubnormal)
    late_underflow_return = Statement(
        ConditionBlock(
            test_subnormal,
            ExpRaiseReturn(ML_FPE_Underflow,
                           return_value=late_underflow_result)),
        Return(late_underflow_result, precision=self.precision))

    # standard path: result = 2**ik * poly
    twok = ExponentInsertion(ik,
                             tag="exp_ik",
                             debug=debug_multi,
                             precision=self.precision)
    std_result = twok * poly
    std_result.set_attributes(tag="std_result", debug=debug_multi)
    result_scheme = ConditionBlock(
        late_overflow_test, late_overflow_return,
        ConditionBlock(late_underflow_test, late_underflow_return,
                       Return(std_result, precision=self.precision)))
    std_return = ConditionBlock(
        early_overflow_test, early_overflow_return,
        ConditionBlock(early_underflow_test, early_underflow_return,
                       result_scheme))

    # main scheme
    Log.report(Log.Info, "\033[33;1m MDL scheme \033[0m")
    scheme = ConditionBlock(
        test_nan_or_inf,
        Statement(ClearException() if self.libm_compliant else Statement(),
                  specific_return), std_return)

    return scheme
def generate_scheme(self):
    """Build the evaluation scheme of the hyperbolic cosine of input ``x``.

    The scheme works on ``|x|`` and relies on
    ``cosh(x) = 1/2 * (exp(x) + exp(-x))``:

      * ``k = trunc(|x| * 2**index_size / log(2))`` and the reduced argument
        ``r = (|x| - k * log2_hi) - k * log2_lo`` are computed, where
        ``log2_hi + log2_lo`` approximates ``log(2) / 2**index_size``;
      * ``k`` is split into ``k_hi = k >> index_size`` and
        ``k_lo = k mod 2**index_size``; ``k_lo`` indexes a table holding
        hi/lo splits of ``2**(-l / 2**index_size)`` and
        ``2**(l / 2**index_size)``;
      * ``exp(r) - 1`` and ``exp(-r) - 1`` are evaluated with the same
        polynomial (Horner scheme), combined with the table values and
        scaled by ``2**(k_hi - 1)`` and ``2**(-k_hi - 1)`` (exponents clamped
        from below to ``emin_normal``);
      * ``+inf`` is selected when ``|x|`` exceeds
        ``acosh(max_value)`` rounded down.

    Only ``ML_Binary32`` and ``ML_Binary64`` are supported for
    ``self.precision`` (any other format raises ``KeyError`` on the integer
    format lookup).

    Returns:
        The ``Statement`` holding the final ``Return`` node.
    """
    # declaring the function input variable
    vx = self.implementation.add_input_variable("x", self.precision)

    Log.set_dump_stdout(True)

    Log.report(Log.Info,
               "\033[33;1m generating implementation scheme \033[0m")
    if self.debug_flag:
        Log.report(Log.Info, "\033[31;1m debug has been enabled \033[0;m")

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    # number of bits of k used to index the table
    index_size = 3

    # cosh is even: from here on vx designates |x|
    vx = Abs(vx)
    int_precision = {
        ML_Binary32: ML_Int32,
        ML_Binary64: ML_Int64
    }[self.precision]

    # argument reduction
    arg_reg_value = log(2) / 2**index_size
    inv_log2_value = round(1 / arg_reg_value,
                           self.precision.get_sollya_object(), RN)
    inv_log2_cst = Constant(inv_log2_value,
                            precision=self.precision,
                            tag="inv_log2")

    # for r_hi to be accurate we ensure k * log2_hi_value_cst is exact
    # by limiting the number of non-zero bits in log2_hi_value_cst
    # cosh(x) ~ exp(abs(x))/2  for a big enough x
    # cosh(x) > 2^1023 <=> exp(x) > 2^1024 <=> x > log(2^1024)
    # k = inv_log2_value * x
    # -1 for guard
    max_k_approx = inv_log2_value * log(sollya.SollyaObject(2)**1024)
    max_k_bitsize = int(ceil(log2(max_k_approx)))
    Log.report(Log.Info, "max_k_bitsize: %d" % max_k_bitsize)
    log2_hi_value_precision = self.precision.get_precision(
    ) - max_k_bitsize - 1

    log2_hi_value = round(arg_reg_value, log2_hi_value_precision, RN)
    log2_lo_value = round(arg_reg_value - log2_hi_value,
                          self.precision.get_sollya_object(), RN)
    log2_hi_value_cst = Constant(log2_hi_value,
                                 tag="log2_hi_value",
                                 precision=self.precision)
    log2_lo_value_cst = Constant(log2_lo_value,
                                 tag="log2_lo_value",
                                 precision=self.precision)

    k = Trunc(Multiplication(inv_log2_cst, vx), precision=self.precision)
    k_log2 = Multiplication(k,
                            log2_hi_value_cst,
                            precision=self.precision,
                            exact=True,
                            tag="k_log2",
                            unbreakable=True)
    r_hi = vx - k_log2
    r_hi.set_attributes(tag="r_hi", debug=debug_multi, unbreakable=True)
    r_lo = -k * log2_lo_value_cst
    # reduced argument
    r = r_hi + r_lo
    r.set_attributes(tag="r", debug=debug_multi)

    # evaluation error of the high part of the reduced argument
    # (informational only), using fixed intervals for |x| and k
    r_eval_error = self.get_eval_error(
        r_hi,
        variable_copy_map={
            vx:
            Variable("vx",
                     interval=Interval(0, 715),
                     precision=self.precision),
            k:
            Variable("k",
                     interval=Interval(0, 1024),
                     precision=int_precision)
        })
    # reported through Log (as the rest of this method) rather than with a
    # Python-2-only print statement
    Log.report(Log.Info, "r_eval_error: {}".format(r_eval_error))

    approx_interval = Interval(-arg_reg_value, arg_reg_value)
    error_goal_approx = 2**-(self.precision.get_precision())

    poly_degree = sup(
        guessdegree(exp(sollya.x), approx_interval, error_goal_approx))
    precision_list = [1] + [self.precision] * (poly_degree)

    k_integer = Conversion(k,
                           precision=int_precision,
                           tag="k_integer",
                           debug=debug_multi)
    # k = k_hi * 2**index_size + k_lo
    k_hi = BitLogicRightShift(k_integer,
                              Constant(index_size),
                              tag="k_int_hi",
                              precision=int_precision,
                              debug=debug_multi)
    k_lo = Modulo(k_integer,
                  2**index_size,
                  tag="k_int_lo",
                  precision=int_precision,
                  debug=debug_multi)
    pow_exp = ExponentInsertion(Conversion(k_hi, precision=int_precision),
                                precision=self.precision,
                                tag="pow_exp",
                                debug=debug_multi)

    # table row i holds, for l = i (i < 2**index_size) or
    # l = i - 2**index_size (otherwise):
    #   columns 0-1: hi / lo split of 2**(-l / 2**index_size)
    #   columns 2-3: hi / lo split of 2**( l / 2**index_size)
    exp_table = ML_Table(dimensions=[2 * 2**index_size, 4],
                         storage_precision=self.precision,
                         tag=self.uniquify_name("exp2_table"))
    for i in range(2 * 2**index_size):
        input_value = i - 2**index_size if i >= 2**index_size else i
        # using SollyaObject wrapper to force evaluation by sollya
        # with higher precision
        exp_value = sollya.SollyaObject(2)**((input_value) *
                                             2**-index_size)
        mexp_value = sollya.SollyaObject(2)**((-input_value) *
                                              2**-index_size)
        pos_value_hi = round(exp_value, self.precision.get_sollya_object(),
                             RN)
        pos_value_lo = round(exp_value - pos_value_hi,
                             self.precision.get_sollya_object(), RN)
        neg_value_hi = round(mexp_value,
                             self.precision.get_sollya_object(), RN)
        neg_value_lo = round(mexp_value - neg_value_hi,
                             self.precision.get_sollya_object(), RN)
        exp_table[i][0] = neg_value_hi
        exp_table[i][1] = neg_value_lo
        exp_table[i][2] = pos_value_hi
        exp_table[i][3] = pos_value_lo

    # log2_value = log(2) / 2^index_size
    # cosh(x) = 1/2 * (exp(x) + exp(-x))
    # exp(x) = exp(x - k * log2_value + k * log2_value)
    #
    # r = x - k * log2_value
    # exp(x) = exp(r) * 2 ^ (k / 2^index_size)
    #
    # k / 2^index_size = h + l * 2^-index_size
    # exp(x) = exp(r) * 2^h * 2^(l *2^-index_size)
    #
    # cosh(x) = exp(r) * 2^(h-1) 2^(l *2^-index_size) + exp(-r) * 2^(-h-1) * 2^(-l *2^-index_size)
    #
    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    poly_object, poly_approx_error = Polynomial.build_from_approximation_with_error(
        exp(sollya.x),
        poly_degree,
        precision_list,
        approx_interval,
        sollya.absolute,
        error_function=error_function)

    Log.report(
        Log.Info, "poly_approx_error: {} {}".format(
            poly_approx_error, float(log2(poly_approx_error))))

    # the constant term is dropped (start_index=1): poly_pos / poly_neg
    # approximate exp(r) - 1 and exp(-r) - 1
    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme
    poly_pos = polynomial_scheme_builder(
        poly_object.sub_poly(start_index=1),
        r,
        unified_precision=self.precision)
    poly_pos.set_attributes(tag="poly_pos", debug=debug_multi)

    poly_neg = polynomial_scheme_builder(
        poly_object.sub_poly(start_index=1),
        -r,
        unified_precision=self.precision)
    poly_neg.set_attributes(tag="poly_neg", debug=debug_multi)

    # the upper half of the table is addressed (rows 2**index_size + k_lo)
    table_index = Addition(k_lo,
                           Constant(2**index_size, precision=int_precision),
                           precision=int_precision,
                           tag="table_index",
                           debug=debug_multi)

    neg_value_load_hi = TableLoad(exp_table,
                                  table_index,
                                  0,
                                  tag="neg_value_load_hi",
                                  debug=debug_multi)
    neg_value_load_lo = TableLoad(exp_table,
                                  table_index,
                                  1,
                                  tag="neg_value_load_lo",
                                  debug=debug_multi)
    pos_value_load_hi = TableLoad(exp_table,
                                  table_index,
                                  2,
                                  tag="pos_value_load_hi",
                                  debug=debug_multi)
    pos_value_load_lo = TableLoad(exp_table,
                                  table_index,
                                  3,
                                  tag="pos_value_load_lo",
                                  debug=debug_multi)

    # scaling exponents h - 1 and -h - 1 (the -1 accounts for the 1/2
    # factor), clamped from below to emin_normal
    k_plus = Max(
        Subtraction(k_hi,
                    Constant(1, precision=int_precision),
                    precision=int_precision,
                    tag="k_plus",
                    debug=debug_multi),
        Constant(self.precision.get_emin_normal(), precision=int_precision))
    k_neg = Max(
        Subtraction(-k_hi,
                    Constant(1, precision=int_precision),
                    precision=int_precision,
                    tag="k_neg",
                    debug=debug_multi),
        Constant(self.precision.get_emin_normal(), precision=int_precision))

    pow_exp_pos = ExponentInsertion(k_plus, precision=self.precision)
    pow_exp_neg = ExponentInsertion(k_neg, precision=self.precision)

    # (t_hi + t_lo) * (1 + poly), expanded so that the dominant term t_hi
    # is added last, then scaled by the power of two
    pos_exp = (
        pos_value_load_hi +
        (pos_value_load_hi * poly_pos +
         (pos_value_load_lo + pos_value_load_lo * poly_pos))) * pow_exp_pos
    pos_exp.set_attributes(tag="pos_exp", debug=debug_multi)

    neg_exp = (
        neg_value_load_hi +
        (neg_value_load_hi * poly_neg +
         (neg_value_load_lo + neg_value_load_lo * poly_neg))) * pow_exp_neg
    neg_exp.set_attributes(tag="neg_exp", debug=debug_multi)

    result = Addition(pos_exp,
                      neg_exp,
                      precision=self.precision,
                      tag="result",
                      debug=debug_multi)

    # overflow threshold: acosh of the largest value of the format,
    # rounded down
    ov_value = round(acosh(self.precision.get_max_value()),
                     self.precision.get_sollya_object(), RD)
    ov_flag = Comparison(Abs(vx),
                         Constant(ov_value, precision=self.precision),
                         specifier=Comparison.Greater)

    # main scheme
    Log.report(Log.Info, "\033[33;1m MDL scheme \033[0m")
    scheme = Statement(
        Return(Select(ov_flag, FP_PlusInfty(self.precision), result)))

    return scheme