def __init__(self,
             precision=ML_Binary64,
             abs_accuracy=S2**-24,
             libm_compliant=True,
             debug_flag=False,
             fuse_fma=True,
             fast_path_extract=True,
             target=GenericProcessor(),
             output_file="log_fixed.c",
             function_name="log_fixed"):
    """Build, optimize and emit the C implementation of the function.

    The generated scheme returns ``CountLeadingZeros(e) + e`` where ``e`` is
    the raw sign/exponent field of the input ``x`` converted to an unsigned
    32-bit integer.  The C source is written to ``"<function name>.c"`` in
    the current working directory.

    Args:
        precision: format of the input variable and of the result.
        abs_accuracy: accepted for interface compatibility; not read here.
        libm_compliant: forwarded to the C code generator.
        debug_flag: when false, debug output is disabled in the generator.
        fuse_fma: when true, the multiply-add fusing pass is run on the scheme.
        fast_path_extract: accepted for interface compatibility; not read here.
        target: processor description used by the optimization engine and
            the code generator.  NOTE: the default instance is created once,
            when the function is defined, and shared by every call relying
            on the default.
        output_file: accepted for interface compatibility; the output name
            is derived from the function name instead.
        function_name: name of the generated C function.
    """
    # declaring CodeFunction and retrieving input variable
    self.function_name = function_name
    self.precision = precision
    self.processor = target

    func_implementation = CodeFunction(self.function_name,
                                       output_format=self.precision)
    vx = func_implementation.add_input_variable("x", self.precision)

    # debug utility: display a value as a C int
    debugd = ML_Debug(display_format="%d",
                      pre_process=lambda v: "(int) %s" % v)

    vx_exp = RawSignExpExtraction(vx, tag="vx_exp", precision=ML_Int32,
                                  debug=debugd)
    vx_exp_u = Conversion(vx_exp, precision=ML_UInt32)
    vx_exp_u.set_precision(ML_UInt32)
    tt = CountLeadingZeros(vx_exp_u)
    tt_u = Conversion(tt, precision=ML_UInt32)
    t = tt_u + vx_exp_u

    scheme = Statement(Return(t))

    opt_eng = OptimizationEngine(self.processor)

    # fusing FMA
    # print(...) with one parenthesised argument prints the same text on
    # both Python 2 and Python 3.
    if fuse_fma:
        print("MDL fusing FMA")
        scheme = opt_eng.fuse_multiply_add(scheme, silence=True)

    print("MDL abstract scheme")
    opt_eng.instantiate_abstract_precision(scheme, None)

    print("MDL instantiated scheme")
    opt_eng.instantiate_precision(scheme, default_precision=self.precision)

    print("subexpression sharing")
    opt_eng.subexpression_sharing(scheme)

    print("silencing operation")
    opt_eng.silence_fp_operations(scheme)

    # registering scheme as function implementation
    func_implementation.set_scheme(scheme)

    # check processor support
    opt_eng.check_processor_support(scheme)

    # factorizing fast path
    opt_eng.factorize_fast_path(scheme)

    cg = CCodeGenerator(self.processor,
                        declare_cst=False,
                        disable_debug=not debug_flag,
                        libm_compliant=libm_compliant)
    self.result = func_implementation.get_definition(cg, C_Code,
                                                     static_cst=True)
    self.result.add_header("support_lib/ml_special_values.h")
    self.result.add_header("math.h")
    self.result.add_header("stdio.h")
    self.result.add_header("inttypes.h")

    # The context manager closes the file even if code generation or the
    # write itself raises.
    with open("%s.c" % func_implementation.get_name(), "w") as output_stream:
        output_stream.write(self.result.get(cg))
def __init__(self,
             precision=ML_Binary32,
             abs_accuracy=S2**-24,
             libm_compliant=True,
             debug_flag=False,
             fuse_fma=True,
             fast_path_extract=True,
             target=GenericProcessor(),
             output_file="log1pf.c",
             function_name="log1pf"):
    """Build, optimize and emit the C implementation of log1p(x).

    Two evaluation paths are generated:

    * inputs whose exponent is below ``ctz_exp_limit`` use a polynomial
      approximation of ``log1p(x) / x`` multiplied by ``x``;
    * other inputs use a table-based argument reduction of ``x + 1`` built
      on the target's division seed, followed by a polynomial evaluation
      and an exponent correction.

    Inputs below -1, infinities, NaNs and subnormals are dispatched to
    dedicated return statements ahead of these two paths.  The C source is
    written to ``"<function name>.c"`` in the current working directory.

    Args:
        precision: format of the input variable and of the result.
        abs_accuracy: accepted for interface compatibility; not read here.
        libm_compliant: forwarded to the C code generator.
        debug_flag: when false, debug output is disabled in the generator.
        fuse_fma: accepted for interface compatibility; the multiply-add
            fusing pass is always run in this implementation.
        fast_path_extract: accepted for interface compatibility; not read
            here.
        target: processor description providing the division seed table;
            also used by the optimization engine and the code generator.
            NOTE: the default instance is created once, when the function
            is defined, and shared by every call relying on the default.
        output_file: accepted for interface compatibility; the output name
            is derived from the function name instead.
        function_name: name of the generated C function.
    """
    # declaring CodeFunction and retrieving input variable
    self.function_name = function_name
    self.precision = precision
    self.processor = target

    func_implementation = CodeFunction(self.function_name,
                                       output_format=self.precision)
    vx = func_implementation.add_input_variable("x", self.precision)

    sollya_precision = self.precision.sollya_object

    # debug utilities
    debuglx = ML_Debug(display_format="%\"PRIx64\"")
    debugd = ML_Debug(display_format="%d",
                      pre_process=lambda v: "(int) %s" % v)
    debug_lftolx = ML_Debug(
        display_format="%\"PRIx64\" ev=%x",
        pre_process=lambda v:
        "double_to_64b_encoding(%s), __k1_fpu_get_exceptions()" % v)

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    # log(2) split into a high part rounded to a reduced number of bits
    # and a low part holding the remainder
    log2_hi_value = round(
        log(2),
        self.precision.get_field_size() -
        (self.precision.get_exponent_size() + 1), sollya.RN)
    log2_lo_value = round(
        log(2) - log2_hi_value, self.precision.sollya_object, sollya.RN)

    log2_hi = Constant(log2_hi_value, precision=self.precision)
    log2_lo = Constant(log2_lo_value, precision=self.precision)

    vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debugd)

    int_precision = ML_Int64 if self.precision is ML_Binary64 else ML_Int32

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision=self.precision)
    dummy_div_seed = DivisionSeed(dummy_var, precision=self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_div_seed,
        language=None,
        table_getter=lambda self: self.approx_table_map)

    # table creation: entry i holds a (high, low) split of log(inv_value)
    # for the i-th division seed value
    table_index_size = 7
    log_table = ML_Table(dimensions=[2**table_index_size, 2],
                         storage_precision=self.precision)
    log_table[0][0] = 0.0
    log_table[0][1] = 0.0
    # range() iterates over the same indices as xrange() and also exists
    # on Python 3
    for i in range(1, 2**table_index_size):
        inv_value = (1.0 + (inv_approx_table[i][0] / S2**9)) * S2**-1
        value_high = round(
            log(inv_value),
            self.precision.get_field_size() -
            (self.precision.get_exponent_size() + 1), sollya.RN)
        value_low = round(
            log(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low

    vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debugd)

    # case close to 0: ctz
    ctz_exp_limit = -7
    ctz_cond = vx_exp < ctz_exp_limit
    ctz_interval = Interval(-S2**ctz_exp_limit, S2**ctz_exp_limit)

    ctz_poly_degree = sup(
        guessdegree(
            log1p(sollya.x) / sollya.x, ctz_interval,
            S2**-(self.precision.get_field_size() + 1))) + 1
    ctz_poly_object = Polynomial.build_from_approximation(
        log1p(sollya.x) / sollya.x, ctz_poly_degree,
        [self.precision] * (ctz_poly_degree + 1), ctz_interval,
        sollya.absolute)

    print("generating polynomial evaluation scheme")
    ctz_poly = PolynomialSchemeEvaluator.generate_horner_scheme(
        ctz_poly_object, vx, unified_precision=self.precision)
    ctz_poly.set_attributes(tag="ctz_poly", debug=debug_lftolx)

    ctz_result = vx * ctz_poly

    # special input detection
    neg_input = Comparison(vx, -1, likely=False,
                           specifier=Comparison.Less,
                           debug=debugd, tag="neg_input")
    vx_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                         debug=debugd, tag="nan_or_inf")
    vx_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False,
                   debug=debugd, tag="snan")
    vx_inf = Test(vx, specifier=Test.IsInfty, likely=False,
                  debug=debugd, tag="inf")
    vx_subnormal = Test(vx, specifier=Test.IsSubnormal, likely=False,
                        debug=debugd, tag="vx_subnormal")

    log_function_code = CodeFunction(
        "new_log", [Variable("x", precision=ML_Binary64)],
        output_format=ML_Binary64)
    log_call_generator = FunctionOperator(
        log_function_code.get_name(),
        arity=1,
        output_precision=ML_Binary64,
        declare_prototype=log_function_code)
    newlog_function = FunctionObject(log_function_code.get_name(),
                                     (ML_Binary64, ), ML_Binary64,
                                     log_call_generator)

    # case away from 0.0
    pre_vxp1 = vx + 1.0
    pre_vxp1.set_attributes(tag="pre_vxp1", debug=debug_lftolx)
    pre_vxp1_exp = ExponentExtraction(pre_vxp1, tag="pre_vxp1_exp",
                                      debug=debugd)
    cm500 = Constant(-500, precision=ML_Int32)
    c0 = Constant(0, precision=ML_Int32)
    # (x + 1) is scaled down by 2^-500 when its exponent exceeds the
    # threshold below; the scaling exponent is added back in corr_exp
    cond_scaling = pre_vxp1_exp > 2**(self.precision.get_exponent_size() - 2)
    scaling_factor_exp = Select(cond_scaling, cm500, c0)
    scaling_factor = ExponentInsertion(scaling_factor_exp,
                                       precision=self.precision,
                                       tag="scaling_factor")

    vxp1 = pre_vxp1 * scaling_factor
    vxp1.set_attributes(tag="vxp1", debug=debug_lftolx)
    vxp1_exp = ExponentExtraction(vxp1, tag="vxp1_exp", debug=debugd)

    vxp1_inv = DivisionSeed(vxp1, precision=self.precision, tag="vxp1_inv",
                            debug=debug_lftolx, silent=True)

    vxp1_dirty_inv = ExponentInsertion(-vxp1_exp, precision=self.precision,
                                       tag="vxp1_dirty_inv",
                                       debug=debug_lftolx)

    table_index = BitLogicAnd(BitLogicRightShift(
        TypeCast(vxp1, precision=int_precision, debug=debuglx),
        self.precision.get_field_size() - 7,
        debug=debuglx),
                              0x7f,
                              tag="table_index",
                              debug=debuglx)

    # argument reduction
    # TODO: detect if single operand inverse seed is supported by the targeted architecture
    # NOTE(review): the seed is cast to ML_UInt64 whatever self.precision
    # is -- confirm this is intended for 32-bit formats.
    pre_arg_red_index = TypeCast(BitLogicAnd(
        TypeCast(vxp1_inv, precision=ML_UInt64),
        Constant(-2, precision=ML_UInt64),
        precision=ML_UInt64),
                                 precision=self.precision,
                                 tag="pre_arg_red_index",
                                 debug=debug_lftolx)
    arg_red_index = Select(Equal(table_index, 0),
                           vxp1_dirty_inv,
                           pre_arg_red_index,
                           tag="arg_red_index",
                           debug=debug_lftolx)

    red_vxp1 = Select(cond_scaling, arg_red_index * vxp1 - 1.0,
                      (arg_red_index * vx - 1.0) + arg_red_index)
    red_vxp1.set_attributes(tag="red_vxp1", debug=debug_lftolx)

    log_inv_lo = TableLoad(log_table, table_index, 1, tag="log_inv_lo",
                           debug=debug_lftolx)
    log_inv_hi = TableLoad(log_table, table_index, 0, tag="log_inv_hi",
                           debug=debug_lftolx)

    inv_err = S2**-6  # TODO: link to target DivisionSeed precision

    print("building mathematical polynomial")
    approx_interval = Interval(-inv_err, inv_err)
    poly_degree = sup(
        guessdegree(
            log(1 + sollya.x) / sollya.x, approx_interval,
            S2**-(self.precision.get_field_size() + 1))) + 1
    global_poly_object = Polynomial.build_from_approximation(
        log(1 + sollya.x) / sollya.x, poly_degree,
        [self.precision] * (poly_degree + 1), approx_interval,
        sollya.absolute)
    poly_object = global_poly_object.sub_poly(start_index=1)

    print("generating polynomial evaluation scheme")
    _poly = PolynomialSchemeEvaluator.generate_horner_scheme(
        poly_object, red_vxp1, unified_precision=self.precision)
    _poly.set_attributes(tag="poly", debug=debug_lftolx)
    print(global_poly_object.get_sollya_object())

    vxp1_inv_exp = ExponentExtraction(vxp1_inv, tag="vxp1_inv_exp",
                                      debug=debugd)
    corr_exp = -vxp1_exp + scaling_factor_exp

    pre_result = -log_inv_hi + (red_vxp1 + red_vxp1 * _poly +
                                (-corr_exp * log2_lo - log_inv_lo))
    pre_result.set_attributes(tag="pre_result", debug=debug_lftolx)
    exact_log2_hi_exp = -corr_exp * log2_hi
    exact_log2_hi_exp.set_attributes(tag="exact_log2_hi_exp",
                                     debug=debug_lftolx,
                                     prevent_optimization=True)

    exact_log2_lo_exp = -corr_exp * log2_lo
    exact_log2_lo_exp.set_attributes(tag="exact_log2_lo_exp",
                                     debug=debug_lftolx)

    # The final sum is built step by step; most steps are flagged with
    # prevent_optimization so that they are kept as written.
    init = exact_log2_lo_exp - log_inv_lo
    init.set_attributes(tag="init", debug=debug_lftolx,
                        prevent_optimization=True)
    fma0 = (red_vxp1 * _poly + init)
    fma0.set_attributes(tag="fma0", debug=debug_lftolx)
    step0 = fma0
    step0.set_attributes(tag="step0", debug=debug_lftolx)
    step1 = step0 + red_vxp1
    step1.set_attributes(tag="step1", debug=debug_lftolx,
                         prevent_optimization=True)
    step2 = -log_inv_hi + step1
    step2.set_attributes(tag="step2", debug=debug_lftolx,
                         prevent_optimization=True)
    std_result = exact_log2_hi_exp + step2
    std_result.set_attributes(tag="std_result", debug=debug_lftolx,
                              prevent_optimization=True)

    # main scheme
    print("MDL scheme")
    pre_scheme = ConditionBlock(
        neg_input,
        Statement(ClearException(), Raise(ML_FPE_Invalid),
                  Return(FP_QNaN(self.precision))),
        ConditionBlock(
            vx_nan_or_inf,
            ConditionBlock(
                vx_inf,
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                Statement(ClearException(),
                          ConditionBlock(vx_snan, Raise(ML_FPE_Invalid)),
                          Return(FP_QNaN(self.precision)))),
            ConditionBlock(
                vx_subnormal, Return(vx),
                ConditionBlock(ctz_cond,
                               Statement(Return(ctz_result), ),
                               Statement(Return(std_result))))))
    scheme = pre_scheme

    opt_eng = OptimizationEngine(self.processor)

    # fusing FMA
    print("MDL fusing FMA")
    scheme = opt_eng.fuse_multiply_add(scheme, silence=True)

    print("MDL abstract scheme")
    opt_eng.instantiate_abstract_precision(scheme, None)

    print("MDL instantiated scheme")
    # Use the requested precision rather than a hard-coded ML_Binary32 so
    # that this pass agrees with the rest of the scheme, which is built
    # from self.precision.  With the default precision this is unchanged.
    opt_eng.instantiate_precision(scheme, default_precision=self.precision)

    print("subexpression sharing")
    opt_eng.subexpression_sharing(scheme)

    print("silencing operation")
    opt_eng.silence_fp_operations(scheme)

    # registering scheme as function implementation
    func_implementation.set_scheme(scheme)

    # check processor support
    opt_eng.check_processor_support(scheme)

    # factorizing fast path
    opt_eng.factorize_fast_path(scheme)

    cg = CCodeGenerator(self.processor,
                        declare_cst=False,
                        disable_debug=not debug_flag,
                        libm_compliant=libm_compliant)
    self.result = func_implementation.get_definition(cg, C_Code,
                                                     static_cst=True)
    self.result.add_header("support_lib/ml_special_values.h")
    self.result.add_header("math.h")
    self.result.add_header("stdio.h")
    self.result.add_header("inttypes.h")

    # The context manager closes the file even if code generation or the
    # write itself raises.
    with open("%s.c" % func_implementation.get_name(), "w") as output_stream:
        output_stream.write(self.result.get(cg))
from metalibm_core.core.ml_entity import ML_Entity, ML_EntityBasis, DefaultEntityArgTemplate
from metalibm_core.code_generation.generator_utility import FunctionOperator, FO_Result, FO_Arg
from metalibm_core.utility.ml_template import *
from metalibm_core.utility.log_report import Log
from metalibm_core.utility.debug_utils import *
from metalibm_core.utility.num_utils import ulp
from metalibm_core.utility.gappa_utils import is_gappa_installed
from metalibm_core.core.ml_hdl_format import *
from metalibm_core.core.ml_hdl_operations import *
from metalibm_hw_blocks.lzc import ML_LeadingZeroCounter

## Debug descriptors shared by the signals declared in this module
debug_std = ML_Debug(display_format=" -radix 2 ")
debug_dec = ML_Debug(display_format=" -radix 10 ")
debug_dec_unsigned = ML_Debug(display_format=" -decimal -unsigned ")


## Zero-extend an operation node
#  @param op the operation node to extend
#  @param s number of bits to add (converted with int())
#  @return a ZeroExt node whose format is a std_logic_vector
#          @p s bits wider than the format of @p op
def zext(op, s):
    extra_bits = int(s)
    widened_format = ML_StdLogicVectorFormat(
        op.get_precision().get_bit_size() + extra_bits)
    return ZeroExt(op, extra_bits, precision=widened_format)
def generate_scheme(self):
    """Describe a two-operand floating-point adder datapath.

    Input signals "x" and "y" are declared in a virtual format wrapping
    self.precision over a std_logic_vector of the same bit size.  The
    mantissa of y is right-shifted by a clamped exponent difference, the
    mantissa of x (negated when the operand signs differ) is added to it,
    the sum is made positive, normalized with a leading zero counter
    component and rounded with a round-to-nearest, ties-to-even increment.
    The result is registered as output signal "vr_out".

    No special-value handling (NaN, infinity, subnormal) is visible in
    this method.

    Returns:
        The entity list produced by the leading zero counter followed by
        self.implementation.
    """
    def get_virtual_cst(prec, value, language):
        # encode <value> with the base format integer coding and build the
        # constant with the support format
        return prec.get_support_format().get_cst(
            prec.get_base_format().get_integer_coding(value, language))

    ## I/O format: self.precision carried by a std_logic_vector of the
    #  same bit size
    io_precision = VirtualFormat(base_format=self.precision,
                                 support_format=ML_StdLogicVectorFormat(
                                     self.precision.get_bit_size()),
                                 get_cst=get_virtual_cst)
    # declaring standard clock and reset input signal
    #clk = self.implementation.add_input_signal("clk", ML_StdLogic)
    # reset = self.implementation.add_input_signal("reset", ML_StdLogic)
    # declaring main input variable
    vx = self.implementation.add_input_signal("x", io_precision)
    vy = self.implementation.add_input_signal("y", io_precision)

    vx_precision = self.precision
    vy_precision = self.precision
    result_precision = self.precision

    # precision for first operand vx which is to be statically
    # positioned
    p = vx_precision.get_mantissa_size()
    # precision for second operand vy which is to be dynamically shifted
    q = vy_precision.get_mantissa_size()
    # precision of output
    o = result_precision.get_mantissa_size()

    # vx must be aligned with vy
    # the largest shift amount (in absolute value) is precision + 2
    # (1 guard bit and 1 rounding bit)
    exp_vx_precision = ML_StdLogicVectorFormat(
        vx_precision.get_exponent_size())
    exp_vy_precision = ML_StdLogicVectorFormat(
        vy_precision.get_exponent_size())

    mant_vx_precision = ML_StdLogicVectorFormat(p - 1)
    mant_vy_precision = ML_StdLogicVectorFormat(q - 1)

    mant_vx = MantissaExtraction(vx, precision=mant_vx_precision)
    mant_vy = MantissaExtraction(vy, precision=mant_vy_precision)

    exp_vx = RawExponentExtraction(vx, precision=exp_vx_precision)
    exp_vy = RawExponentExtraction(vy, precision=exp_vy_precision)

    # Maximum number of leading zeros for normalized <vx>
    L_x = 0
    # Maximum number of leading zeros for normalized <vy>
    L_y = 0

    sign_vx = CopySign(vx, precision=ML_StdLogic)
    sign_vy = CopySign(vy, precision=ML_StdLogic)

    # determining if the operation is an addition (effective_op = '0')
    # or a subtraction (effective_op = '1')
    effective_op = BitLogicXor(sign_vx,
                               sign_vy,
                               precision=ML_StdLogic,
                               tag="effective_op",
                               debug=ML_Debug(display_format="-radix 2"))

    exp_vx_bias = vx_precision.get_bias()
    exp_vy_bias = vy_precision.get_bias()

    exp_offset = max(o + L_y, q) + 2
    exp_bias = exp_offset + exp_vx_bias - exp_vy_bias
    # Determine a working precision to accommodate exponent difference
    # FIXME: check interval and exponent operations size
    exp_precision_ext_size = max(vx_precision.get_exponent_size(),
                                 vy_precision.get_exponent_size()) + 2
    exp_precision_ext = ML_StdLogicVectorFormat(exp_precision_ext_size)
    # Y is first aligned offset = max(o+L_y,q) + 2 bits to the left of x
    # and then shifted right by
    # exp_diff = exp_x - exp_y + offset
    # exp_vx in [emin, emax]
    # exp_vx - exp_vx + p +2 in [emin-emax + p + 2, emax - emin + p + 2]
    # NOTE(review): the line above presumably means exp_vx - exp_vy -- confirm
    exp_diff = Subtraction(
        Addition(zext(
            exp_vx,
            exp_precision_ext_size - vx_precision.get_exponent_size()),
                 Constant(exp_bias, precision=exp_precision_ext),
                 precision=exp_precision_ext),
        zext(exp_vy,
             exp_precision_ext_size - vy_precision.get_exponent_size()),
        precision=exp_precision_ext,
        tag="exp_diff",
        debug=debug_std)
    signed_exp_diff = SignCast(exp_diff,
                               specifier=SignCast.Signed,
                               precision=exp_precision_ext)
    datapath_full_width = exp_offset + max(o + L_x, p) + 2 + q
    max_exp_diff = datapath_full_width - q
    exp_diff_lt_0 = Comparison(signed_exp_diff,
                               Constant(0, precision=exp_precision_ext),
                               specifier=Comparison.Less,
                               precision=ML_Bool,
                               tag="exp_diff_lt_0",
                               debug=debug_std)
    exp_diff_gt_max_diff = Comparison(signed_exp_diff,
                                      Constant(
                                          max_exp_diff,
                                          precision=exp_precision_ext),
                                      specifier=Comparison.Greater,
                                      precision=ML_Bool)

    shift_amount_prec = ML_StdLogicVectorFormat(
        int(floor(log2(max_exp_diff)) + 1))

    # shift amount is exp_diff clamped to [0, max_exp_diff]
    mant_shift = Select(exp_diff_lt_0,
                        Constant(0, precision=shift_amount_prec),
                        Select(exp_diff_gt_max_diff,
                               Constant(max_exp_diff,
                                        precision=shift_amount_prec),
                               Truncate(exp_diff,
                                        precision=shift_amount_prec),
                               precision=shift_amount_prec),
                        precision=shift_amount_prec,
                        tag="mant_shift",
                        debug=ML_Debug(display_format="-radix 10"))

    mant_ext_size = max_exp_diff
    shift_prec = ML_StdLogicVectorFormat(datapath_full_width)
    # rzext is not defined in the visible part of this file; presumably a
    # right-side zero extension -- TODO confirm
    shifted_mant_vy = BitLogicRightShift(rzext(mant_vy, mant_ext_size),
                                         mant_shift,
                                         precision=shift_prec,
                                         tag="shifted_mant_vy",
                                         debug=debug_std)
    # vx is right-extended by q+2 bits
    # and left-extended by exp_offset + 1 bits
    mant_vx_ext = zext(rzext(mant_vx, q + 2), exp_offset + 1)

    add_prec = ML_StdLogicVectorFormat(datapath_full_width + 1)

    # vx mantissa is negated when the effective operation is a subtraction
    mant_vx_add_op = Select(Comparison(effective_op,
                                       Constant(1, precision=ML_StdLogic),
                                       precision=ML_Bool,
                                       specifier=Comparison.Equal),
                            Negation(mant_vx_ext,
                                     precision=add_prec,
                                     tag="neg_mant_vx"),
                            mant_vx_ext,
                            precision=add_prec,
                            tag="mant_vx_add_op",
                            debug=ML_Debug(display_format=" "))

    mant_add = Addition(zext(shifted_mant_vy, 1),
                        mant_vx_add_op,
                        precision=add_prec,
                        tag="mant_add",
                        debug=ML_Debug(display_format=" -radix 2"))

    # if the addition overflows, then it meant vx has been negated and
    # the 2's complement addition cancelled the negative MSB, thus
    # the addition result is positive, and the result is of the sign of Y
    # else the result is of opposite sign to Y
    add_is_negative = BitLogicAnd(CopySign(mant_add,
                                           precision=ML_StdLogic),
                                  effective_op,
                                  precision=ML_StdLogic,
                                  tag="add_is_negative",
                                  debug=ML_Debug(" -radix 2"))
    # Negate mantissa addition result if it is negative
    mant_add_abs = Select(Comparison(add_is_negative,
                                     Constant(1, precision=ML_StdLogic),
                                     specifier=Comparison.Equal,
                                     precision=ML_Bool),
                          Negation(mant_add,
                                   precision=add_prec,
                                   tag="neg_mant_add",
                                   debug=debug_std),
                          mant_add,
                          precision=add_prec,
                          tag="mant_add_abs",
                          debug=debug_std)

    res_sign = BitLogicXor(add_is_negative,
                           sign_vy,
                           precision=ML_StdLogic,
                           tag="res_sign")

    # Precision for leading zero count
    lzc_width = int(floor(log2(datapath_full_width + 1)) + 1)
    lzc_prec = ML_StdLogicVectorFormat(lzc_width)

    # instantiate a leading zero counter entity matching the adder width
    lzc_args = ML_LeadingZeroCounter.get_default_args(
        width=(datapath_full_width + 1))
    LZC_entity = ML_LeadingZeroCounter(lzc_args)
    lzc_entity_list = LZC_entity.generate_scheme()
    lzc_implementation = LZC_entity.get_implementation()

    lzc_component = lzc_implementation.get_component_object()

    #lzc_in = SubSignalSelection(mant_add, p+1, 2*p+3)
    lzc_in = mant_add_abs  # SubSignalSelection(mant_add_abs, 0, 3*p+3, precision = ML_StdLogicVectorFormat(3*p+4))

    # add_lzc is first declared as a local signal, then rebound to a
    # PlaceHolder tying it to the counter component instance
    add_lzc = Signal("add_lzc",
                     precision=lzc_prec,
                     var_type=Signal.Local,
                     debug=debug_dec)
    add_lzc = PlaceHolder(
        add_lzc, lzc_component(io_map={
            "x": lzc_in,
            "vr_out": add_lzc
        }))

    # Index of output mantissa least significant bit
    mant_lsb_index = datapath_full_width - o + 1

    #add_lzc = CountLeadingZeros(mant_add, precision = lzc_prec)
    # CP stands for close path, the data path where X and Y are within 1 exp diff
    res_normed_mant = BitLogicLeftShift(mant_add_abs,
                                        add_lzc,
                                        precision=add_prec,
                                        tag="res_normed_mant",
                                        debug=debug_std)
    pre_mant_field = SubSignalSelection(
        res_normed_mant,
        mant_lsb_index,
        datapath_full_width - 1,
        precision=ML_StdLogicVectorFormat(o - 1))

    ## Helper function to extract a single bit
    #  from a vector of bits signal
    def BitExtraction(optree, index, **kw):
        return VectorElementSelection(optree,
                                      index,
                                      precision=ML_StdLogic,
                                      **kw)

    ## Helper function building an ML_Integer constant
    def IntCst(value):
        return Constant(value, precision=ML_Integer)

    round_bit = BitExtraction(res_normed_mant, IntCst(mant_lsb_index - 1))
    mant_lsb = BitExtraction(res_normed_mant, IntCst(mant_lsb_index))
    sticky_prec = ML_StdLogicVectorFormat(datapath_full_width - o)
    sticky_input = SubSignalSelection(res_normed_mant,
                                      0,
                                      datapath_full_width - o - 1,
                                      precision=sticky_prec)
    # sticky bit is set when any of the selected low-order bits is non-zero
    sticky_bit = Select(Comparison(sticky_input,
                                   Constant(0, precision=sticky_prec),
                                   specifier=Comparison.NotEqual,
                                   precision=ML_Bool),
                        Constant(1, precision=ML_StdLogic),
                        Constant(0, precision=ML_StdLogic),
                        precision=ML_StdLogic,
                        tag="sticky_bit",
                        debug=debug_std)

    # increment selection for rounding to nearest (tie to even)
    round_increment_RN = BitLogicAnd(round_bit,
                                     BitLogicOr(sticky_bit,
                                                mant_lsb,
                                                precision=ML_StdLogic),
                                     precision=ML_StdLogic,
                                     tag="round_increment_RN",
                                     debug=debug_std)

    rounded_mant = Addition(zext(pre_mant_field, 1),
                            round_increment_RN,
                            precision=ML_StdLogicVectorFormat(o),
                            tag="rounded_mant",
                            debug=debug_std)
    # MSB of the rounded mantissa, also added to the result exponent below
    rounded_overflow = BitExtraction(rounded_mant,
                                     IntCst(o - 1),
                                     tag="rounded_overflow",
                                     debug=debug_std)
    res_mant_field = Select(Comparison(rounded_overflow,
                                       Constant(1, precision=ML_StdLogic),
                                       specifier=Comparison.Equal,
                                       precision=ML_Bool),
                            SubSignalSelection(rounded_mant, 1, o - 1),
                            SubSignalSelection(rounded_mant, 0, o - 2),
                            precision=ML_StdLogicVectorFormat(o - 1),
                            tag="final_mant",
                            debug=debug_std)

    res_exp_tmp_size = max(vx_precision.get_exponent_size(),
                           vy_precision.get_exponent_size()) + 2

    res_exp_tmp_prec = ML_StdLogicVectorFormat(res_exp_tmp_size)

    exp_vy_biased = Addition(zext(
        exp_vy, res_exp_tmp_size - vy_precision.get_exponent_size()),
                             Constant(vy_precision.get_bias() + 1,
                                      precision=res_exp_tmp_prec),
                             precision=res_exp_tmp_prec,
                             tag="exp_vy_biased",
                             debug=debug_dec)
    # vx's exponent is biased with the format bias
    # plus the exponent offset so it is left aligned to datapath MSB
    exp_vx_biased = Addition(
        zext(exp_vx,
             res_exp_tmp_size - vx_precision.get_exponent_size()),
        Constant(vx_precision.get_bias() + exp_offset + 1,
                 precision=res_exp_tmp_prec),
        precision=res_exp_tmp_prec,
        tag="exp_vx_biased",
        debug=debug_dec)

    # If exp diff is less than 0, then we must consider that vy's exponent is
    # the meaningful one and thus compute result exponent with respect
    # to vy's exponent value
    res_exp_base = Select(exp_diff_lt_0,
                          exp_vy_biased,
                          exp_vx_biased,
                          precision=res_exp_tmp_prec,
                          tag="res_exp_base",
                          debug=debug_dec)

    # Finally the result exponent is the exponent base minus the format
    # bias, minus the leading zero count, plus the rounding overflow bit
    res_exp_ext = Addition(Subtraction(
        Addition(zext(res_exp_base, 0),
                 Constant(-result_precision.get_bias(),
                          precision=res_exp_tmp_prec),
                 precision=res_exp_tmp_prec),
        zext(add_lzc, res_exp_tmp_size - lzc_width),
        precision=res_exp_tmp_prec),
                           rounded_overflow,
                           precision=res_exp_tmp_prec,
                           tag="res_exp_ext",
                           debug=debug_std)

    res_exp_prec = ML_StdLogicVectorFormat(
        result_precision.get_exponent_size())

    res_exp = Truncate(res_exp_ext,
                       precision=res_exp_prec,
                       tag="res_exp",
                       debug=debug_dec_unsigned)

    # assemble sign, exponent and mantissa fields and cast to the I/O format
    vr_out = TypeCast(FloatBuild(
        res_sign,
        res_exp,
        res_mant_field,
        precision=self.precision,
    ),
                      precision=io_precision,
                      tag="result",
                      debug=debug_std)

    self.implementation.add_output_signal("vr_out", vr_out)

    return lzc_entity_list + [self.implementation]
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
###############################################################################
from metalibm_core.core.attributes import ML_Debug, ML_AdvancedDebug, ML_MultiDebug
from metalibm_core.core.ml_formats import *


def _cast_to_int(v):
    # wrap the displayed expression in a C cast to int
    return "(int) %s" % v


# Shared debug descriptors

# floating-point values, printed with %f and %lf respectively
debugf = ML_Debug(display_format="%f")
debuglf = ML_Debug(display_format="%lf")

# integer printed in hexadecimal
debugx = ML_Debug(display_format="%x")

# 64-bit integer printed in hexadecimal (PRIx64 macro)
debuglx = ML_Debug(display_format="%\"PRIx64\"")

# value cast to int and printed in decimal
debugd = ML_Debug(display_format="%d", pre_process=_cast_to_int)

# long integer printed in decimal
debugld = ML_Debug(display_format="%ld")
def generate_scheme(self): ## Generate Fused multiply and add comput <x> . <y> + <z> Log.report( Log.Info, "generating fixed MPFMA with {ed} extra digit(s) and sign-magnitude accumulator: {sm}" .format(ed=self.extra_digit, sm=self.sign_magnitude)) def get_virtual_cst(prec, value, language): return prec.get_support_format().get_cst( prec.get_base_format().get_integer_coding(value, language)) ## convert @p value from an input floating-point precision # @p in_precision to an output support format @p out_precision io_precision = HdlVirtualFormat(self.precision) # declaring standard clock and reset input signal #clk = self.implementation.add_input_signal("clk", ML_StdLogic) # reset = self.implementation.add_input_signal("reset", ML_StdLogic) # declaring main input variable # maximum weigth for a mantissa product digit max_prod_exp = self.precision.get_emax() * 2 + 1 # minimum wieght for a mantissa product digit min_prod_exp = self.precision.get_emin_subnormal() * 2 ## Most and least significant digit index for the # accumulator acc_msb_index = max_prod_exp + self.extra_digit acc_lsb_index = min_prod_exp acc_width = acc_msb_index - min_prod_exp + 1 # precision of the accumulator acc_prec = ML_StdLogicVectorFormat(acc_width) reset = self.implementation.add_input_signal("reset", ML_StdLogic) vx = self.implementation.add_input_signal("x", io_precision) vy = self.implementation.add_input_signal("y", io_precision) # Inserting post-input pipeline stage if self.pipelined: self.implementation.start_new_stage() acc = self.implementation.add_input_signal("acc", acc_prec) if self.sign_magnitude: # the accumulator is in sign-magnitude representation sign_acc = self.implementation.add_input_signal( "sign_acc", ML_StdLogic) else: sign_acc = CopySign(acc, precision=ML_StdLogic, tag="sign_acc", debug=debug_std) vx_precision = self.precision vy_precision = self.precision result_precision = acc_prec # precision for first operand vx which is to be statically # positionned p = 
vx_precision.get_mantissa_size() # precision for second operand vy which is to be dynamically shifted q = vy_precision.get_mantissa_size() # vx must be aligned with vy # the largest shit amount (in absolute value) is precision + 2 # (1 guard bit and 1 rounding bit) exp_vx_precision = ML_StdLogicVectorFormat( vx_precision.get_exponent_size()) exp_vy_precision = ML_StdLogicVectorFormat( vy_precision.get_exponent_size()) mant_vx_precision = ML_StdLogicVectorFormat(p - 1) mant_vy_precision = ML_StdLogicVectorFormat(q - 1) mant_vx = MantissaExtraction(vx, precision=mant_vx_precision) mant_vy = MantissaExtraction(vy, precision=mant_vy_precision) exp_vx = ExponentExtraction(vx, precision=exp_vx_precision, tag="exp_vx", debug=debug_dec) exp_vy = ExponentExtraction(vy, precision=exp_vy_precision, tag="exp_vy", debug=debug_dec) # Maximum number of leading zero for normalized <vx> mantissa L_x = 0 # Maximum number of leading zero for normalized <vy> mantissa L_y = 0 # Maximum number of leading zero for the product of <x>.<y> # mantissa. 
L_xy = L_x + L_y + 1 sign_vx = CopySign(vx, precision=ML_StdLogic) sign_vy = CopySign(vy, precision=ML_StdLogic) # determining if the operation is an addition (effective_op = '0') # or a subtraction (effective_op = '1') sign_xy = BitLogicXor(sign_vx, sign_vy, precision=ML_StdLogic, tag="sign_xy", debug=ML_Debug(display_format="-radix 2")) effective_op = BitLogicXor(sign_xy, sign_acc, precision=ML_StdLogic, tag="effective_op", debug=ML_Debug(display_format="-radix 2")) exp_vx_bias = vx_precision.get_bias() exp_vy_bias = vy_precision.get_bias() # <acc> is statically positionned in the datapath, # it may even constitute the whole datapath # # the product is shifted with respect to the fix accumulator exp_bias = (exp_vx_bias + exp_vy_bias) # because of the mantissa range [1, 2[, the product exponent # is located one bit to the right (lower) of the product MSB prod_exp_offset = 1 # Determine a working precision to accomodate exponent difference # FIXME: check interval and exponent operations size exp_precision_ext_size = max( vx_precision.get_exponent_size(), vy_precision.get_exponent_size(), abs(ceil(log2(abs(acc_msb_index)))), abs(ceil(log2(abs(acc_lsb_index)))), abs(ceil(log2(abs(exp_bias + prod_exp_offset)))), ) + 2 Log.report(Log.Info, "exp_precision_ext_size={}".format(exp_precision_ext_size)) exp_precision_ext = ML_StdLogicVectorFormat(exp_precision_ext_size) # static accumulator exponent exp_acc = Constant(acc_msb_index, precision=exp_precision_ext, tag="exp_acc", debug=debug_cst_dec) # Y is first aligned offset = max(o+L_y,q) + 2 bits to the left of x # and then shifted right by # exp_diff = exp_x - exp_y + offset # exp_vx in [emin, emax] # exp_vx - exp_vx + p +2 in [emin-emax + p + 2, emax - emin + p + 2] exp_diff = Subtraction( exp_acc, Addition(Addition(zext( exp_vy, exp_precision_ext_size - vy_precision.get_exponent_size()), zext( exp_vx, exp_precision_ext_size - vx_precision.get_exponent_size()), precision=exp_precision_ext), Constant(exp_bias + 
prod_exp_offset, precision=exp_precision_ext, tag="diff_bias", debug=debug_cst_dec), precision=exp_precision_ext, tag="pre_exp_diff", debug=debug_dec), precision=exp_precision_ext, tag="exp_diff", debug=debug_dec) signed_exp_diff = SignCast(exp_diff, specifier=SignCast.Signed, precision=exp_precision_ext) datapath_full_width = acc_width # the maximum exp diff is the size of the datapath # minus the bit size of the product max_exp_diff = datapath_full_width - (p + q) exp_diff_lt_0 = Comparison(signed_exp_diff, Constant(0, precision=exp_precision_ext), specifier=Comparison.Less, precision=ML_Bool, tag="exp_diff_lt_0", debug=debug_std) exp_diff_gt_max_diff = Comparison(signed_exp_diff, Constant( max_exp_diff, precision=exp_precision_ext), specifier=Comparison.Greater, precision=ML_Bool) shift_amount_prec = ML_StdLogicVectorFormat( int(floor(log2(max_exp_diff)) + 1)) mant_shift = Select(exp_diff_lt_0, Constant(0, precision=shift_amount_prec), Select(exp_diff_gt_max_diff, Constant(max_exp_diff, precision=shift_amount_prec), Truncate(exp_diff, precision=shift_amount_prec), precision=shift_amount_prec), precision=shift_amount_prec, tag="mant_shift", debug=ML_Debug(display_format="-radix 10")) prod_prec = ML_StdLogicVectorFormat(p + q) prod = Multiplication(mant_vx, mant_vy, precision=prod_prec, tag="prod", debug=debug_std) # attempt at pipelining the operator # self.implementation.start_new_stage() mant_ext_size = datapath_full_width - (p + q) shift_prec = ML_StdLogicVectorFormat(datapath_full_width) shifted_prod = BitLogicRightShift(rzext(prod, mant_ext_size), mant_shift, precision=shift_prec, tag="shifted_prod", debug=debug_std) ## Inserting a pipeline stage after the product shifting if self.pipelined: self.implementation.start_new_stage() if self.sign_magnitude: # the accumulator is in sign-magnitude representation acc_negated = Select(Comparison(sign_xy, sign_acc, specifier=Comparison.Equal, precision=ML_Bool), acc, BitLogicNegate(acc, precision=acc_prec), 
precision=acc_prec) # one extra MSB bit is added to the final addition # to detect overflows add_width = acc_width + 1 add_prec = ML_StdLogicVectorFormat(add_width) # FIXME: implement with a proper compound adder mant_add_p0_ext = Addition(zext(shifted_prod, 1), zext(acc_negated, 1), precision=add_prec) mant_add_p1_ext = Addition( mant_add_p0_ext, Constant(1, precision=ML_StdLogic), precision=add_prec, tag="mant_add", debug=ML_Debug(display_format=" -radix 2")) # discarding carry overflow bit mant_add_p0 = SubSignalSelection(mant_add_p0_ext, 0, acc_width - 1, precision=acc_prec) mant_add_p1 = SubSignalSelection(mant_add_p1_ext, 0, acc_width - 1, precision=acc_prec) mant_add_pre_sign = CopySign(mant_add_p1_ext, precision=ML_StdLogic, tag="mant_add_pre_sign", debug=debug_std) mant_add = Select(Comparison(sign_xy, sign_acc, specifier=Comparison.Equal, precision=ML_Bool), mant_add_p0, Select( Comparison(mant_add_pre_sign, Constant(1, precision=ML_StdLogic), specifier=Comparison.Equal, precision=ML_Bool), mant_add_p1, BitLogicNegate(mant_add_p0, precision=acc_prec), precision=acc_prec, ), precision=acc_prec, tag="mant_add") # if both operands had the same sign, then # mant_add is necessarily positive and the result # sign matches the input sign # if both operands had opposite signs, then # the result sign matches the product sign # if mant_add is positive, else the accumulator sign output_sign = Select( Comparison(effective_op, Constant(1, precision=ML_StdLogic), specifier=Comparison.Equal, precision=ML_Bool), # if the effective op is a subtraction (prod - acc) BitLogicXor(sign_acc, mant_add_pre_sign, precision=ML_StdLogic), # the effective op is an addition, thus result and # acc share sign sign_acc, precision=ML_StdLogic, tag="output_sign") if self.pipelined: self.implementation.start_new_stage() # adding output self.implementation.add_output_signal("vr_sign", output_sign) self.implementation.add_output_signal("vr_acc", mant_add) else: # 2s complement encoding of the 
accumulator, # the accumulator is never negated, only the producted # is negated if negative # negate shifted prod when required shifted_prod_op = Select(Comparison(sign_xy, Constant( 1, precision=ML_StdLogic), specifier=Comparison.Equal, precision=ML_Bool), Negation(shifted_prod, precision=shift_prec), shifted_prod, precision=shift_prec) add_prec = shift_prec # ML_StdLogicVectorFormat(datapath_full_width + 1) mant_add = Addition(shifted_prod_op, acc, precision=acc_prec, tag="mant_add", debug=ML_Debug(display_format=" -radix 2")) if self.pipelined: self.implementation.start_new_stage() self.implementation.add_output_signal("vr_acc", mant_add) return [self.implementation]
def generate_scheme(self):
    """Build the operation graph of a floating-point adder computing x + y.

    Both operands and the result use ``self.precision``.  The datapath is:
    field extraction, alignment of ``y`` by a clamped right shift,
    two's-complement addition (``x`` is negated when the signs differ),
    absolute value, leading-zero count and normalisation, rounding to
    nearest (ties to even), and result assembly on output ``vr_out``.

    Returns:
        list: the entities generated for the leading-zero counter followed
        by ``self.implementation``.
    """

    def get_virtual_cst(prec, value, language):
        return prec.get_support_format().get_cst(
            prec.get_base_format().get_integer_coding(value, language))

    # Floating-point values travel on std_logic_vector ports of the same
    # bit size as the floating-point format.
    io_precision = VirtualFormat(
        base_format=self.precision,
        support_format=ML_StdLogicVectorFormat(
            self.precision.get_bit_size()),
        get_cst=get_virtual_cst)

    # declaring the reset input signal (the call registers the port even
    # though the returned signal is not used below)
    reset = self.implementation.add_input_signal("reset", ML_StdLogic)
    # declaring main input variables
    vx = self.implementation.add_input_signal("x", io_precision)
    vy = self.implementation.add_input_signal("y", io_precision)

    # mantissa size; every datapath width below is expressed in terms of p
    p = self.precision.get_mantissa_size()

    # vx must be aligned with vy
    # the largest shift amount (in absolute value) is precision + 2
    # (1 guard bit and 1 rounding bit)
    exp_precision = ML_StdLogicVectorFormat(
        self.precision.get_exponent_size())
    # FIX: the mantissa signals are declared p bits wide.  The datapath
    # widths used below (3*p+4 for the shifter, 3*p+5 for the adder) are
    # only consistent with p-bit mantissas: p + (2*p+4) = 3*p+4.
    mant_precision = ML_StdLogicVectorFormat(p)

    mant_vx = MantissaExtraction(vx, precision=mant_precision)
    mant_vy = MantissaExtraction(vy, precision=mant_precision)

    exp_vx = ExponentExtraction(vx, precision=exp_precision)
    exp_vy = ExponentExtraction(vy, precision=exp_precision)

    sign_vx = CopySign(vx, precision=ML_StdLogic)
    sign_vy = CopySign(vy, precision=ML_StdLogic)

    # determining if the operation is an addition (effective_op = '0')
    # or a subtraction (effective_op = '1')
    effective_op = BitLogicXor(sign_vx,
                               sign_vy,
                               precision=ML_StdLogic,
                               tag="effective_op",
                               debug=ML_Debug(display_format="-radix 2"))

    ## Wrapper for zero extension
    # @param op the input operation tree
    # @param s integer size of the extension
    # @return the Zero extended operation node
    def zext(op, s):
        op_size = op.get_precision().get_bit_size()
        ext_precision = ML_StdLogicVectorFormat(op_size + s)
        return ZeroExt(op, s, precision=ext_precision)

    ## Generate the right zero extended output from @p optree
    def rzext(optree, ext_size):
        op_size = optree.get_precision().get_bit_size()
        ext_format = ML_StdLogicVectorFormat(ext_size)
        out_format = ML_StdLogicVectorFormat(op_size + ext_size)
        return Concatenation(optree,
                             Constant(0, precision=ext_format),
                             precision=out_format)

    exp_bias = p + 2
    exp_precision_ext = ML_StdLogicVectorFormat(
        self.precision.get_exponent_size() + 2)
    # Y is first aligned p+2 bits to the left of x
    # and then shifted right by
    # exp_diff = exp_x - exp_y + p + 2
    # NOTE(review): exp_diff is compared against constants below without
    # an explicit signed cast; whether Comparison.Less treats the vector
    # as signed depends on the backend -- confirm.
    exp_diff = Subtraction(Addition(zext(exp_vx, 2),
                                    Constant(exp_bias,
                                             precision=exp_precision_ext),
                                    precision=exp_precision_ext),
                           zext(exp_vy, 2),
                           precision=exp_precision_ext,
                           tag="exp_diff")
    exp_diff_lt_0 = Comparison(exp_diff,
                               Constant(0, precision=exp_precision_ext),
                               specifier=Comparison.Less,
                               precision=ML_Bool)
    exp_diff_gt_2pp4 = Comparison(exp_diff,
                                  Constant(2 * p + 4,
                                           precision=exp_precision_ext),
                                  specifier=Comparison.Greater,
                                  precision=ML_Bool)

    # the shift amount is exp_diff clamped to [0, 2*p+4]
    shift_amount_prec = ML_StdLogicVectorFormat(
        int(floor(log2(2 * p + 4)) + 1))
    mant_shift = Select(exp_diff_lt_0,
                        Constant(0, precision=shift_amount_prec),
                        Select(exp_diff_gt_2pp4,
                               Constant(2 * p + 4,
                                        precision=shift_amount_prec),
                               Truncate(exp_diff,
                                        precision=shift_amount_prec),
                               precision=shift_amount_prec),
                        precision=shift_amount_prec,
                        tag="mant_shift",
                        debug=ML_Debug(display_format="-radix 10"))

    mant_ext_size = 2 * p + 4
    shift_prec = ML_StdLogicVectorFormat(3 * p + 4)
    shifted_mant_vy = BitLogicRightShift(rzext(mant_vy, mant_ext_size),
                                         mant_shift,
                                         precision=shift_prec,
                                         tag="shifted_mant_vy",
                                         debug=debug_std)
    # x is statically positioned: p+2 zero bits on its right and
    # p+3 zero bits on its left, for a total of 3*p+5 bits
    mant_vx_ext = zext(rzext(mant_vx, p + 2), p + 2 + 1)

    add_prec = ML_StdLogicVectorFormat(3 * p + 5)

    # x is negated (two's complement) for an effective subtraction
    mant_vx_add_op = Select(Comparison(effective_op,
                                       Constant(1, precision=ML_StdLogic),
                                       precision=ML_Bool,
                                       specifier=Comparison.Equal),
                            Negation(mant_vx_ext,
                                     precision=add_prec,
                                     tag="neg_mant_vx"),
                            mant_vx_ext,
                            precision=add_prec,
                            tag="mant_vx_add_op",
                            debug=ML_Debug(display_format=" "))

    mant_add = Addition(zext(shifted_mant_vy, 1),
                        mant_vx_add_op,
                        precision=add_prec,
                        tag="mant_add",
                        debug=ML_Debug(display_format=" -radix 2"))

    # The sum can only be negative for an effective subtraction; in that
    # case the result takes the sign opposite to Y, otherwise it takes
    # the sign of Y.
    add_is_negative = BitLogicAnd(CopySign(mant_add, precision=ML_StdLogic),
                                  effective_op,
                                  precision=ML_StdLogic,
                                  tag="add_is_negative",
                                  debug=ML_Debug(" -radix 2"))
    # Negate mantissa addition result if it is negative
    mant_add_abs = Select(Comparison(add_is_negative,
                                     Constant(1, precision=ML_StdLogic),
                                     specifier=Comparison.Equal,
                                     precision=ML_Bool),
                          Negation(mant_add,
                                   precision=add_prec,
                                   tag="neg_mant_add"),
                          mant_add,
                          precision=add_prec,
                          tag="mant_add_abs")

    res_sign = BitLogicXor(add_is_negative,
                           sign_vy,
                           precision=ML_StdLogic,
                           tag="res_sign")

    # Precision for leading zero count
    lzc_width = int(floor(log2(3 * p + 5)) + 1)
    lzc_prec = ML_StdLogicVectorFormat(lzc_width)

    # the leading-zero counter is generated as a separate entity and
    # instantiated here as a component
    lzc_args = ML_LeadingZeroCounter.get_default_args(width=(3 * p + 5))
    LZC_entity = ML_LeadingZeroCounter(lzc_args)
    lzc_entity_list = LZC_entity.generate_scheme()
    lzc_implementation = LZC_entity.get_implementation()

    lzc_component = lzc_implementation.get_component_object()

    lzc_in = mant_add_abs

    add_lzc = Signal("add_lzc",
                     precision=lzc_prec,
                     var_type=Signal.Local,
                     debug=debug_dec)
    add_lzc = PlaceHolder(
        add_lzc, lzc_component(io_map={
            "x": lzc_in,
            "vr_out": add_lzc
        }))

    # FIX: normalise the magnitude of the sum.  The leading-zero count is
    # computed on mant_add_abs and the sign is emitted separately through
    # res_sign, so shifting the signed value mant_add would produce a
    # wrong mantissa whenever the sum is negative.
    res_normed_mant = BitLogicLeftShift(mant_add_abs,
                                        add_lzc,
                                        precision=add_prec,
                                        tag="res_normed_mant",
                                        debug=debug_std)
    # after normalisation the leading one sits at index 3*p+4; the p-1
    # stored mantissa bits are right below it
    pre_mant_field = SubSignalSelection(
        res_normed_mant,
        2 * p + 5,
        3 * p + 3,
        precision=ML_StdLogicVectorFormat(p - 1))

    ## Helper function to extract a single bit
    # from a vector of bits signal
    def BitExtraction(optree, index, **kw):
        return VectorElementSelection(optree,
                                      index,
                                      precision=ML_StdLogic,
                                      **kw)

    def IntCst(value):
        return Constant(value, precision=ML_Integer)

    round_bit = BitExtraction(res_normed_mant, IntCst(2 * p + 4))
    mant_lsb = BitExtraction(res_normed_mant, IntCst(2 * p + 5))
    sticky_prec = ML_StdLogicVectorFormat(2 * p + 4)
    sticky_input = SubSignalSelection(res_normed_mant,
                                      0,
                                      2 * p + 3,
                                      precision=sticky_prec)
    # sticky bit is set when any bit below the round bit is set
    sticky_bit = Select(Comparison(sticky_input,
                                   Constant(0, precision=sticky_prec),
                                   specifier=Comparison.NotEqual,
                                   precision=ML_Bool),
                        Constant(1, precision=ML_StdLogic),
                        Constant(0, precision=ML_StdLogic),
                        precision=ML_StdLogic,
                        tag="sticky_bit",
                        debug=debug_std)

    # increment selection for rounding to nearest (tie to even)
    round_increment_RN = BitLogicAnd(round_bit,
                                     BitLogicOr(sticky_bit,
                                                mant_lsb,
                                                precision=ML_StdLogic),
                                     precision=ML_StdLogic,
                                     tag="round_increment_RN",
                                     debug=debug_std)

    # one extra MSB captures the carry out of the rounding increment
    rounded_mant = Addition(zext(pre_mant_field, 1),
                            round_increment_RN,
                            precision=ML_StdLogicVectorFormat(p),
                            tag="rounded_mant",
                            debug=debug_std)
    rounded_overflow = BitExtraction(rounded_mant,
                                     IntCst(p - 1),
                                     tag="rounded_overflow",
                                     debug=debug_std)
    res_mant_field = Select(Comparison(rounded_overflow,
                                       Constant(1, precision=ML_StdLogic),
                                       specifier=Comparison.Equal,
                                       precision=ML_Bool),
                            SubSignalSelection(rounded_mant, 1, p - 1),
                            SubSignalSelection(rounded_mant, 0, p - 2),
                            precision=ML_StdLogicVectorFormat(p - 1),
                            tag="final_mant",
                            debug=debug_std)

    # result exponent: exp_x + (p + 3) - leading_zero_count
    #                  + rounding overflow
    res_exp_prec_size = self.precision.get_exponent_size() + 2
    res_exp_prec = ML_StdLogicVectorFormat(res_exp_prec_size)

    res_exp_ext = Addition(Subtraction(
        Addition(zext(exp_vx, 2),
                 Constant(3 + p, precision=res_exp_prec),
                 precision=res_exp_prec),
        zext(add_lzc, res_exp_prec_size - lzc_width),
        precision=res_exp_prec),
                           rounded_overflow,
                           precision=res_exp_prec,
                           tag="res_exp_ext",
                           debug=debug_std)

    res_exp = Truncate(res_exp_ext,
                       precision=ML_StdLogicVectorFormat(
                           self.precision.get_exponent_size()),
                       tag="res_exp",
                       debug=debug_dec)

    vr_out = TypeCast(FloatBuild(
        res_sign,
        res_exp,
        res_mant_field,
        precision=self.precision,
    ),
                      precision=io_precision,
                      tag="result",
                      debug=debug_std)

    self.implementation.add_output_signal("vr_out", vr_out)

    return lzc_entity_list + [self.implementation]
def generate_scheme(self):
    """Build the operation graph of a mixed-precision FMA x * y + z.

    ``x`` and ``y`` use ``self.precision``; ``z`` and the result use
    ``self.acc_precision``.  The product is statically positioned in the
    datapath while ``z`` is aligned by a clamped right shift.  The sum is
    then made positive, normalised with a leading-zero counter, rounded to
    nearest (ties to even) and emitted on output ``vr_out``.  When
    ``self.pipelined`` is set, a new pipeline stage is started at each
    marked point, so the order of the statements below matters.

    Returns:
        list: the entities generated for the leading-zero counter followed
        by ``self.implementation``.
    """
    Log.report(
        Log.Info,
        "generating MPFMA with acc precision {acc_precision} and precision {precision}"
        .format(acc_precision=self.acc_precision, precision=self.precision))

    def get_virtual_cst(prec, value, language):
        return prec.get_support_format().get_cst(
            prec.get_base_format().get_integer_coding(value, language))

    # Floating-point values travel on std_logic_vector ports of the same
    # bit size as their floating-point format.
    prod_input_precision = VirtualFormat(
        base_format=self.precision,
        support_format=ML_StdLogicVectorFormat(
            self.precision.get_bit_size()),
        get_cst=get_virtual_cst)

    accumulator_precision = VirtualFormat(
        base_format=self.acc_precision,
        support_format=ML_StdLogicVectorFormat(
            self.acc_precision.get_bit_size()),
        get_cst=get_virtual_cst)

    # declaring main input variables
    vx = self.implementation.add_input_signal("x", prod_input_precision)
    vy = self.implementation.add_input_signal("y", prod_input_precision)
    vz = self.implementation.add_input_signal("z", accumulator_precision)

    # extra reset input port (the call registers the port even though the
    # returned signal is not used below)
    reset = self.implementation.add_input_signal("reset", ML_StdLogic)

    # Inserting post-input pipeline stage
    if self.pipelined:
        self.implementation.start_new_stage()

    vx_precision = self.precision
    vy_precision = self.precision
    vz_precision = self.acc_precision
    result_precision = self.acc_precision

    # mantissa size of first product operand vx
    p = vx_precision.get_mantissa_size()
    # mantissa size of second product operand vy
    q = vy_precision.get_mantissa_size()
    # mantissa size of the addend vz, which is dynamically shifted
    r = vz_precision.get_mantissa_size()
    # mantissa size of the output
    o = result_precision.get_mantissa_size()

    exp_vx_precision = ML_StdLogicVectorFormat(
        vx_precision.get_exponent_size())
    exp_vy_precision = ML_StdLogicVectorFormat(
        vy_precision.get_exponent_size())
    exp_vz_precision = ML_StdLogicVectorFormat(
        vz_precision.get_exponent_size())

    # MantissaExtraction performs the implicit
    # digit computation and concatenation
    mant_vx_precision = ML_StdLogicVectorFormat(p)
    mant_vy_precision = ML_StdLogicVectorFormat(q)
    mant_vz_precision = ML_StdLogicVectorFormat(r)

    mant_vx = MantissaExtraction(vx, precision=mant_vx_precision)
    mant_vy = MantissaExtraction(vy, precision=mant_vy_precision)
    mant_vz = MantissaExtraction(vz, precision=mant_vz_precision)

    exp_vx = ExponentExtraction(vx, precision=exp_vx_precision)
    exp_vy = ExponentExtraction(vy, precision=exp_vy_precision)
    exp_vz = ExponentExtraction(vz, precision=exp_vz_precision)

    # Maximum number of leading zero for normalized <vx> mantissa
    L_x = 0
    # Maximum number of leading zero for normalized <vy> mantissa
    L_y = 0
    # Maximum number of leading zero for normalized <vz> mantissa
    L_z = 0
    # Maximum number of leading zero for the product of <x>.<y>
    # mantissa.
    L_xy = L_x + L_y + 1

    sign_vx = CopySign(vx, precision=ML_StdLogic)
    sign_vy = CopySign(vy, precision=ML_StdLogic)
    sign_vz = CopySign(vz, precision=ML_StdLogic)

    # determining if the operation is an addition (effective_op = '0')
    # or a subtraction (effective_op = '1')
    sign_xy = BitLogicXor(sign_vx,
                          sign_vy,
                          precision=ML_StdLogic,
                          tag="sign_xy",
                          debug=ML_Debug(display_format="-radix 2"))
    effective_op = BitLogicXor(sign_xy,
                               sign_vz,
                               precision=ML_StdLogic,
                               tag="effective_op",
                               debug=ML_Debug(display_format="-radix 2"))

    exp_vx_bias = vx_precision.get_bias()
    exp_vy_bias = vy_precision.get_bias()
    exp_vz_bias = vz_precision.get_bias()

    # x.y is statically positioned in the datapath
    # while z is shifted
    # This is justified by the fact that z alignment may be performed
    # in parallel with the multiplication of x and y mantissas
    # The product is positioned <exp_offset>-bit to the right of datapath MSB
    # (without including an extra carry bit)
    exp_offset = max(o + L_z, r) + 2
    exp_bias = exp_offset + (exp_vx_bias + exp_vy_bias) - exp_vz_bias

    # because of the mantissa range [1, 2[, the product exponent
    # is located one bit to the right (lower) of the product MSB
    prod_exp_offset = 1

    # Determine a working precision to accommodate exponent difference
    # FIXME: check interval and exponent operations size
    exp_precision_ext_size = max(vx_precision.get_exponent_size(),
                                 vy_precision.get_exponent_size(),
                                 vz_precision.get_exponent_size()) + 2
    exp_precision_ext = ML_StdLogicVectorFormat(exp_precision_ext_size)

    # z is the shifted operand; its right-shift amount is derived from
    # exp_diff = exp_x + exp_y + exp_bias + prod_exp_offset - exp_z
    exp_diff = Subtraction(Addition(Addition(
        zext(exp_vy,
             exp_precision_ext_size - vy_precision.get_exponent_size()),
        zext(exp_vx,
             exp_precision_ext_size - vx_precision.get_exponent_size()),
        precision=exp_precision_ext),
                                    Constant(exp_bias + prod_exp_offset,
                                             precision=exp_precision_ext),
                                    precision=exp_precision_ext),
                           zext(
                               exp_vz, exp_precision_ext_size -
                               vz_precision.get_exponent_size()),
                           precision=exp_precision_ext,
                           tag="exp_diff",
                           debug=debug_std)
    # comparisons on the exponent difference are performed on a signed view
    signed_exp_diff = SignCast(exp_diff,
                               specifier=SignCast.Signed,
                               precision=exp_precision_ext)
    datapath_full_width = exp_offset + max(o + L_xy, p + q) + 2 + r
    max_exp_diff = datapath_full_width - r
    exp_diff_lt_0 = Comparison(signed_exp_diff,
                               Constant(0, precision=exp_precision_ext),
                               specifier=Comparison.Less,
                               precision=ML_Bool,
                               tag="exp_diff_lt_0",
                               debug=debug_std)
    exp_diff_gt_max_diff = Comparison(signed_exp_diff,
                                      Constant(
                                          max_exp_diff,
                                          precision=exp_precision_ext),
                                      specifier=Comparison.Greater,
                                      precision=ML_Bool)

    # the shift amount is exp_diff clamped to [0, max_exp_diff]
    shift_amount_prec = ML_StdLogicVectorFormat(
        int(floor(log2(max_exp_diff)) + 1))
    mant_shift = Select(exp_diff_lt_0,
                        Constant(0, precision=shift_amount_prec),
                        Select(exp_diff_gt_max_diff,
                               Constant(max_exp_diff,
                                        precision=shift_amount_prec),
                               Truncate(exp_diff,
                                        precision=shift_amount_prec),
                               precision=shift_amount_prec),
                        precision=shift_amount_prec,
                        tag="mant_shift",
                        debug=ML_Debug(display_format="-radix 10"))

    prod_prec = ML_StdLogicVectorFormat(p + q)
    prod = Multiplication(mant_vx,
                          mant_vy,
                          precision=prod_prec,
                          tag="prod",
                          debug=debug_std)

    mant_ext_size = max_exp_diff
    print("mant_ext_size: %d" % max_exp_diff)
    print("datapath_full_width: %d" % datapath_full_width)
    shift_prec = ML_StdLogicVectorFormat(datapath_full_width)
    mant_vz_ext = rzext(mant_vz, mant_ext_size)
    shifted_mant_vz = BitLogicRightShift(mant_vz_ext,
                                         mant_shift,
                                         precision=shift_prec,
                                         tag="shifted_mant_vz",
                                         debug=debug_std)

    # Inserting pipeline stage
    # after product computation
    # and addend alignment shift
    if self.pipelined:
        self.implementation.start_new_stage()

    # the product is right-extended by r+2 bits
    # and left-extended by exp_offset+1 bits
    prod_ext = zext(rzext(prod, r + 2), exp_offset + 1)

    add_prec = ML_StdLogicVectorFormat(datapath_full_width + 1)

    ## Here we make the supposition that
    # the product is slower to compute than
    # aligning <vz> and negating it if necessary,
    # so the addend (not the product) is the operand that gets negated.
    # The addend is only bitwise-negated here; the "+1" completing the
    # two's complement is provided by mant_add_p1 below.
    addend_op = Select(Comparison(effective_op,
                                  Constant(1, precision=ML_StdLogic),
                                  precision=ML_Bool,
                                  specifier=Comparison.Equal),
                       BitLogicNegate(zext(shifted_mant_vz, 1),
                                      precision=add_prec,
                                      tag="neg_addend_Op"),
                       zext(shifted_mant_vz, 1),
                       precision=add_prec,
                       tag="addend_op",
                       debug=debug_std)

    prod_add_op = prod_ext

    # Compound Addition: both sum and sum + 1 are computed
    mant_add_p1 = Addition(Addition(addend_op,
                                    prod_add_op,
                                    precision=add_prec),
                           Constant(1, precision=ML_StdLogic),
                           precision=add_prec,
                           tag="mant_add_p1",
                           debug=ML_Debug(display_format=" -radix 2"))
    mant_add_p0 = Addition(addend_op,
                           prod_add_op,
                           precision=add_prec,
                           tag="mant_add_p0",
                           debug=ML_Debug(display_format=" -radix 2"))

    # The sum can only be negative for an effective subtraction
    # (prod - addend, given by mant_add_p1); in that case the result
    # takes the sign opposite to the product.
    add_is_negative = BitLogicAnd(CopySign(mant_add_p1,
                                           precision=ML_StdLogic),
                                  effective_op,
                                  precision=ML_StdLogic,
                                  tag="add_is_negative",
                                  debug=ML_Debug(" -radix 2"))
    # FIX: non-negative result selection.
    #  - effective subtraction: ~addend + prod + 1 = prod - addend
    #    (mant_add_p1)
    #  - effective addition: the addend was not negated, so the exact sum
    #    is mant_add_p0; selecting mant_add_p1 there would add a spurious
    #    one to the datapath LSB and corrupt the sticky bit / ties.
    mant_add_pos = Select(Comparison(effective_op,
                                     Constant(1, precision=ML_StdLogic),
                                     specifier=Comparison.Equal,
                                     precision=ML_Bool),
                          mant_add_p1,
                          mant_add_p0,
                          precision=add_prec,
                          tag="mant_add_pos",
                          debug=debug_std)
    # Negate mantissa addition result if it is negative:
    # -(~addend + prod + 1) = ~(~addend + prod) = ~mant_add_p0
    mant_add_abs = Select(Comparison(add_is_negative,
                                     Constant(1, precision=ML_StdLogic),
                                     specifier=Comparison.Equal,
                                     precision=ML_Bool),
                          BitLogicNegate(mant_add_p0,
                                         precision=add_prec,
                                         tag="neg_mant_add_p0",
                                         debug=debug_std),
                          mant_add_pos,
                          precision=add_prec,
                          tag="mant_add_abs",
                          debug=debug_std)

    # determining result sign: the sum has the same sign as the product
    # unless it turned out negative
    res_sign = BitLogicXor(add_is_negative,
                           sign_xy,
                           precision=ML_StdLogic,
                           tag="res_sign")

    print("pre lzc stage: %d " % self.implementation.get_current_stage())
    # adding pipeline stage after addition computation
    if self.pipelined:
        self.implementation.start_new_stage()
    print("lzc stage: %d " % self.implementation.get_current_stage())

    # Precision for leading zero count
    lzc_width = int(floor(log2(datapath_full_width + 1)) + 1)
    lzc_prec = ML_StdLogicVectorFormat(lzc_width)

    current_stage = self.implementation.get_current_stage()
    print("saving current_stage: %d" % current_stage)

    # the leading-zero counter is generated as a separate entity and
    # instantiated here as a component
    lzc_args = ML_LeadingZeroCounter.get_default_args(
        width=(datapath_full_width + 1))
    LZC_entity = ML_LeadingZeroCounter(lzc_args)
    lzc_entity_list = LZC_entity.generate_scheme()
    lzc_implementation = LZC_entity.get_implementation()

    lzc_component = lzc_implementation.get_component_object()

    # Attributes dynamic field (init_stage and init_op)
    # constructors must be initialized back after
    # building a sub-operator inside this operator
    self.implementation.instanciate_dyn_attributes()

    add_lzc_sig = Signal("add_lzc",
                         precision=lzc_prec,
                         var_type=Signal.Local,
                         debug=debug_dec)
    add_lzc = PlaceHolder(add_lzc_sig,
                          lzc_component(io_map={
                              "x": mant_add_abs,
                              "vr_out": add_lzc_sig
                          },
                                        tag="lzc_i"),
                          tag="place_holder")

    # adding pipeline stage after leading zero count
    if self.pipelined:
        self.implementation.start_new_stage()

    # Index of output mantissa least significant bit
    mant_lsb_index = datapath_full_width - o + 1

    # normalisation: the leading one is moved to the datapath MSB
    res_normed_mant = BitLogicLeftShift(mant_add_abs,
                                        add_lzc,
                                        precision=add_prec,
                                        tag="res_normed_mant",
                                        debug=debug_std)
    pre_mant_field = SubSignalSelection(
        res_normed_mant,
        mant_lsb_index,
        datapath_full_width - 1,
        precision=ML_StdLogicVectorFormat(o - 1))

    ## Helper function to extract a single bit
    # from a vector of bits signal
    def BitExtraction(optree, index, **kw):
        return VectorElementSelection(optree,
                                      index,
                                      precision=ML_StdLogic,
                                      **kw)

    def IntCst(value):
        return Constant(value, precision=ML_Integer)

    # adding pipeline stage after normalization shift
    if self.pipelined:
        self.implementation.start_new_stage()

    round_bit = BitExtraction(res_normed_mant, IntCst(mant_lsb_index - 1))
    mant_lsb = BitExtraction(res_normed_mant, IntCst(mant_lsb_index))
    sticky_prec = ML_StdLogicVectorFormat(datapath_full_width - o)
    sticky_input = SubSignalSelection(res_normed_mant,
                                      0,
                                      datapath_full_width - o - 1,
                                      precision=sticky_prec)
    # sticky bit is set when any bit below the round bit is set
    sticky_bit = Select(Comparison(sticky_input,
                                   Constant(0, precision=sticky_prec),
                                   specifier=Comparison.NotEqual,
                                   precision=ML_Bool),
                        Constant(1, precision=ML_StdLogic),
                        Constant(0, precision=ML_StdLogic),
                        precision=ML_StdLogic,
                        tag="sticky_bit",
                        debug=debug_std)

    # increment selection for rounding to nearest (tie to even)
    round_increment_RN = BitLogicAnd(round_bit,
                                     BitLogicOr(sticky_bit,
                                                mant_lsb,
                                                precision=ML_StdLogic),
                                     precision=ML_StdLogic,
                                     tag="round_increment_RN",
                                     debug=debug_std)

    # one extra MSB captures the carry out of the rounding increment
    rounded_mant = Addition(zext(pre_mant_field, 1),
                            round_increment_RN,
                            precision=ML_StdLogicVectorFormat(o),
                            tag="rounded_mant",
                            debug=debug_std)
    rounded_overflow = BitExtraction(rounded_mant,
                                     IntCst(o - 1),
                                     tag="rounded_overflow",
                                     debug=debug_std)
    res_mant_field = Select(Comparison(rounded_overflow,
                                       Constant(1, precision=ML_StdLogic),
                                       specifier=Comparison.Equal,
                                       precision=ML_Bool),
                            SubSignalSelection(rounded_mant, 1, o - 1),
                            SubSignalSelection(rounded_mant, 0, o - 2),
                            precision=ML_StdLogicVectorFormat(o - 1),
                            tag="final_mant",
                            debug=debug_std)

    res_exp_tmp_size = max(vx_precision.get_exponent_size(),
                           vy_precision.get_exponent_size(),
                           vz_precision.get_exponent_size()) + 2

    res_exp_tmp_prec = ML_StdLogicVectorFormat(res_exp_tmp_size)

    # Product biased exponent
    # is computed from both x and y exponent
    exp_xy_biased = Addition(Addition(
        Addition(zext(exp_vy,
                      res_exp_tmp_size - vy_precision.get_exponent_size()),
                 Constant(vy_precision.get_bias(),
                          precision=res_exp_tmp_prec),
                 precision=res_exp_tmp_prec,
                 tag="exp_vy_biased",
                 debug=debug_dec),
        Addition(zext(exp_vx,
                      res_exp_tmp_size - vx_precision.get_exponent_size()),
                 Constant(vx_precision.get_bias(),
                          precision=res_exp_tmp_prec),
                 precision=res_exp_tmp_prec,
                 tag="exp_vx_biased",
                 debug=debug_dec),
        precision=res_exp_tmp_prec),
                             Constant(
                                 exp_offset + 1,
                                 precision=res_exp_tmp_prec,
                             ),
                             precision=res_exp_tmp_prec,
                             tag="exp_xy_biased",
                             debug=debug_dec)
    # vz's exponent is offset by the format bias plus one
    # (the "+ exp_offset + 1" variant is disabled)
    exp_vz_biased = Addition(
        zext(exp_vz, res_exp_tmp_size - vz_precision.get_exponent_size()),
        Constant(
            vz_precision.get_bias() + 1,  # + exp_offset + 1,
            precision=res_exp_tmp_prec),
        precision=res_exp_tmp_prec,
        tag="exp_vz_biased",
        debug=debug_dec)

    # If exp diff is less than 0, then we must consider that vz's exponent is
    # the meaningful one and thus compute result exponent with respect
    # to vz's exponent value
    res_exp_base = Select(exp_diff_lt_0,
                          exp_vz_biased,
                          exp_xy_biased,
                          precision=res_exp_tmp_prec,
                          tag="res_exp_base",
                          debug=debug_dec)

    # Eventually we combine the result exponent base with the (negated)
    # result bias, the leading zero count and the rounding overflow
    res_exp_ext = Addition(Subtraction(
        Addition(zext(res_exp_base, 0),
                 Constant(-result_precision.get_bias(),
                          precision=res_exp_tmp_prec),
                 precision=res_exp_tmp_prec),
        zext(add_lzc, res_exp_tmp_size - lzc_width),
        precision=res_exp_tmp_prec),
                           rounded_overflow,
                           precision=res_exp_tmp_prec,
                           tag="res_exp_ext",
                           debug=debug_std)

    res_exp_prec = ML_StdLogicVectorFormat(
        result_precision.get_exponent_size())

    res_exp = Truncate(res_exp_ext,
                       precision=res_exp_prec,
                       tag="res_exp",
                       debug=debug_dec_unsigned)

    vr_out = TypeCast(FloatBuild(
        res_sign,
        res_exp,
        res_mant_field,
        precision=accumulator_precision,
    ),
                      precision=accumulator_precision,
                      tag="result",
                      debug=debug_std)

    # adding pipeline stage after rounding
    if self.pipelined:
        self.implementation.start_new_stage()

    self.implementation.add_output_signal("vr_out", vr_out)

    return lzc_entity_list + [self.implementation]
from metalibm_core.core.polynomials import * from metalibm_core.core.ml_entity import ML_Entity, ML_EntityBasis, DefaultEntityArgTemplate from metalibm_core.code_generation.generator_utility import FunctionOperator, FO_Result, FO_Arg from metalibm_core.utility.ml_template import * from metalibm_core.utility.log_report import Log from metalibm_core.utility.debug_utils import * from metalibm_core.utility.num_utils import ulp from metalibm_core.utility.gappa_utils import is_gappa_installed from metalibm_core.core.ml_hdl_format import * from metalibm_core.core.ml_hdl_operations import * from metalibm_hw_blocks.lzc import ML_LeadingZeroCounter debug_std = ML_Debug(display_format=" -radix 2 ") debug_dec = ML_Debug(display_format=" -radix 10 ") class FP_Adder(ML_Entity("fp_adder")): def __init__( self, arg_template=DefaultEntityArgTemplate, precision=ML_Binary32, libm_compliant=True, debug_flag=False, target=VHDLBackend(), output_file="fp_adder.vhd", entity_name="fp_adder", language=VHDL_Code, ):
def __init__(self,
             precision=ML_Binary32,
             abs_accuracy=S2**-24,
             libm_compliant=True,
             debug_flag=False,
             fuse_fma=True,
             num_iter=3,
             fast_path_extract=True,
             target=GenericProcessor(),
             output_file="__divsf3.c",
             function_name="__divsf3"):
    """Generate a C implementation of the floating-point division x / y.

    The quotient is built from a hardware division seed refined by
    ``num_iter`` Newton-Raphson iterations on the reciprocal, followed by
    residual-based corrections of the quotient.  The resulting scheme is
    optimized, written as C code to ``output_file`` and finally a Gappa
    script bounding the error of the reciprocal approximation is run.

    Args:
        precision: floating-point format of the inputs and of the result.
        abs_accuracy: accepted for interface compatibility; not read here.
        libm_compliant: forwarded to the C code generator.
        debug_flag: when true, debug display is kept in the generated code.
        fuse_fma: when true, multiply-add pairs of the scheme are fused.
        num_iter: number of Newton-Raphson iterations on the reciprocal.
        fast_path_extract: accepted for interface compatibility; fast-path
            factorization is not applied to this scheme.
        target: processor description used for support checks and codegen.
        output_file: path of the generated C file.
        function_name: name of the generated C function.
    """
    # declaring CodeFunction and retrieving input variables
    self.precision = precision
    self.function_name = function_name
    exp_implementation = CodeFunction(self.function_name,
                                      output_format=precision)
    vx = exp_implementation.add_input_variable("x", precision)
    vy = exp_implementation.add_input_variable("y", precision)
    processor = target

    class NR_Iteration(object):
        """One Newton-Raphson step refining an approximation of 1/divisor."""

        def __init__(self, approx, divisor, force_fma=False):
            self.approx = approx
            self.divisor = divisor
            self.force_fma = force_fma
            if force_fma:
                # error = 1 - divisor * approx, computed with a single FMA
                self.error = FusedMultiplyAdd(
                    divisor, approx, 1.0,
                    specifier=FusedMultiplyAdd.SubtractNegate)
                self.new_approx = FusedMultiplyAdd(
                    self.error, self.approx, self.approx,
                    specifier=FusedMultiplyAdd.Standard)
            else:
                self.error = 1 - divisor * approx
                self.new_approx = self.approx + self.error * self.approx

        def get_new_approx(self):
            return self.new_approx

        def get_hint_rules(self, gcg, gappa_code, exact):
            """Register in gappa_code the rewriting hints for this step."""
            divisor = self.divisor.get_handle().get_node()
            approx = self.approx.get_handle().get_node()
            new_approx = self.new_approx.get_handle().get_node()

            # hint expressions are built as exact (unrounded) operations
            Attributes.set_default_precision(ML_Exact)

            if self.force_fma:
                rule0 = FusedMultiplyAdd(
                    divisor, approx, 1.0,
                    specifier=FusedMultiplyAdd.SubtractNegate)
            else:
                rule0 = 1.0 - divisor * approx
            rule1 = 1.0 - divisor * (approx - exact) - 1.0

            rule2 = new_approx - exact
            subrule = approx * (2 - divisor * approx)
            rule3 = (new_approx - subrule) \
                - (approx - exact) * (approx - exact) * divisor

            if self.force_fma:
                new_error = FusedMultiplyAdd(
                    divisor, approx, 1.0,
                    specifier=FusedMultiplyAdd.SubtractNegate)
                rule4 = FusedMultiplyAdd(new_error, approx, approx)
            else:
                rule4 = approx + (1 - divisor * approx) * approx

            Attributes.unset_default_precision()

            # registering hints
            gcg.add_hint(gappa_code, rule0, rule1)
            gcg.add_hint(gappa_code, rule2, rule3)
            gcg.add_hint(gappa_code, subrule, rule4)

    # debug display formats used by the nodes below
    debugf = ML_Debug(display_format="%f")
    debuglx = ML_Debug(display_format="%lx")
    debugd = ML_Debug(display_format="%d")
    debug_lftolx = ML_Debug(
        display_format="%\"PRIx64\" ev=%x",
        pre_process=lambda v:
        "double_to_64b_encoding(%s), __k1_fpu_get_exceptions()" % v)
    debug_ddtolx = ML_Debug(
        display_format="%\"PRIx64\" %\"PRIx64\"",
        pre_process=lambda v:
        "double_to_64b_encoding(%s.hi), double_to_64b_encoding(%s.lo)"
        % (v, v))

    # input exponents, clamped to [-1020, 1020] before being used as
    # scaling exponents
    ex = Max(Min(ExponentExtraction(vx), 1020), -1020,
             tag="ex", debug=debugd)
    ey = Max(Min(ExponentExtraction(vy), 1020), -1020,
             tag="ey", debug=debugd)

    Attributes.set_default_rounding_mode(ML_RoundToNearest)
    Attributes.set_default_silent(True)

    # computing the division seed on inputs scaled by 2^-ex and 2^-ey
    init_approx = None

    scaling_factor_x = ExponentInsertion(-ex, tag="sfx_ei")
    scaling_factor_y = ExponentInsertion(-ey, tag="sfy_ei")

    scaled_vx = vx * scaling_factor_x
    scaled_vy = vy * scaling_factor_y

    scaled_vx.set_attributes(debug=debug_lftolx, tag="scaled_vx")
    scaled_vy.set_attributes(debug=debug_lftolx, tag="scaled_vy")

    scaled_vx.set_precision(ML_Binary64)
    scaled_vy.set_precision(ML_Binary64)

    # forcing vx precision to make processor support test
    init_approx_precision = DivisionSeed(
        scaled_vx, scaled_vy, precision=self.precision,
        tag="seed", debug=debug_lftolx)
    if processor.is_supported_operation(init_approx_precision):
        init_approx = init_approx_precision
    elif self.precision != ML_Binary32:
        # fall back on a binary32 seed converted to the working precision
        px = Conversion(scaled_vx, precision=ML_Binary32,
                        tag="px", debug=debugf)
        py = Conversion(scaled_vy, precision=ML_Binary32,
                        tag="py", debug=debugf)
        init_approx_fp32 = Conversion(
            DivisionSeed(px, py, precision=ML_Binary32,
                         tag="seed", debug=debugf),
            precision=self.precision, tag="seed_ext", debug=debug_lftolx)
        if processor.is_supported_operation(init_approx_fp32):
            init_approx = init_approx_fp32
        else:
            Log.report(
                Log.Error,
                "The target %s does not implement division seed"
                % processor)
    else:
        Log.report(
            Log.Error,
            "The target %s does not implement division seed" % processor)

    current_approx_std = init_approx

    Attributes.unset_default_rounding_mode()
    Attributes.unset_default_silent()

    def compute_div(_init_approx, _vx=None, _vy=None, scale_result=None):
        """Build the quotient _vx / _vy starting from the seed _init_approx.

        scale_result, when given, is an integer exponent node s such that
        the unscaled quotient must be multiplied by 2^s.

        Returns the tuple (result, subnormal_result, reciprocal
        approximation, list of NR_Iteration, last residual node,
        unscaled quotient node).
        """
        inv_iteration_list = []
        Attributes.set_default_rounding_mode(ML_RoundToNearest)
        Attributes.set_default_silent(True)

        # correctly-rounded inverse computation; only the last iteration
        # is forced to use FMA
        _current_approx = _init_approx
        for i in range(num_iter):
            new_iteration = NR_Iteration(
                _current_approx, _vy, force_fma=(i == num_iter - 1))
            inv_iteration_list.append(new_iteration)
            _current_approx = new_iteration.get_new_approx()
            _current_approx.set_attributes(tag="iter_%d" % i,
                                           debug=debug_lftolx)

        def dividend_mult(div_approx, inv_approx, dividend, divisor,
                          index, force_fma=False):
            # yerr = dividend - div_approx * divisor
            yerr = FMSN(div_approx, divisor, dividend)
            yerr.set_attributes(tag="yerr%d" % index, debug=debug_lftolx)
            # new_div = div_approx + yerr * inv_approx
            new_div = FMA(yerr, inv_approx, div_approx)
            new_div.set_attributes(tag="new_div%d" % index,
                                   debug=debug_lftolx)
            return new_div

        # multiplication correction iteration
        # to get correctly rounded full division
        _current_approx.set_attributes(tag="final_approx",
                                       debug=debug_lftolx)
        current_div_approx = _vx * _current_approx
        num_dividend_mult_iteration = 1
        for i in range(num_dividend_mult_iteration):
            current_div_approx = dividend_mult(
                current_div_approx, _current_approx, _vx, _vy, i)

        # last iteration: the residual is built while the default rounding
        # mode and silent attributes are still set, the final FMA after
        # they are unset
        yerr_last = FMSN(current_div_approx, _vy, _vx)
        Attributes.unset_default_rounding_mode()
        Attributes.unset_default_silent()
        last_div_approx = FMA(yerr_last, _current_approx,
                              current_div_approx,
                              rounding_mode=ML_GlobalRoundMode)

        yerr_last.set_attributes(tag="yerr_last", debug=debug_lftolx)

        pre_result = last_div_approx
        pre_result.set_attributes(tag="unscaled_div_result",
                                  debug=debug_lftolx)

        # `is not None`: the scale is an operation node, so an (in)equality
        # operator may be overloaded on it
        if scale_result is not None:
            # 2^scale_result is applied as three successive factors, the
            # first two with exponents clamped to [-950, 950]
            scale_factor_0 = Max(Min(scale_result, 950), -950,
                                 tag="scale_factor_0", debug=debugd)
            scale_factor_1 = Max(Min(scale_result - scale_factor_0, 950),
                                 -950, tag="scale_factor_1", debug=debugd)
            scale_factor_2 = scale_result \
                - (scale_factor_1 + scale_factor_0)
            scale_factor_2.set_attributes(debug=debugd,
                                          tag="scale_factor_2")

            result = ((pre_result * ExponentInsertion(scale_factor_0))
                      * ExponentInsertion(scale_factor_1)) \
                * ExponentInsertion(scale_factor_2)
        else:
            result = pre_result
        result.set_attributes(tag="result", debug=debug_lftolx)

        # subnormal path: extended-precision quotient, subnormalized with
        # respect to ex - ey and then rescaled
        ext_pre_result = FMA(yerr_last, _current_approx,
                             current_div_approx,
                             precision=ML_DoubleDouble,
                             tag="ext_pre_result", debug=debug_ddtolx)
        subnormal_pre_result = SpecificOperation(
            ext_pre_result, ex - ey, precision=self.precision,
            specifier=SpecificOperation.Subnormalize,
            tag="subnormal_pre_result", debug=debug_lftolx)
        sub_scale_factor = ex - ey
        sub_scale_factor_0 = Max(Min(sub_scale_factor, 950), -950,
                                 tag="sub_scale_factor_0", debug=debugd)
        sub_scale_factor_1 = Max(
            Min(sub_scale_factor - sub_scale_factor_0, 950), -950,
            tag="sub_scale_factor_1", debug=debugd)
        sub_scale_factor_2 = sub_scale_factor \
            - (sub_scale_factor_1 + sub_scale_factor_0)
        sub_scale_factor_2.set_attributes(debug=debugd,
                                          tag="sub_scale_factor_2")
        subnormal_result = (
            subnormal_pre_result * ExponentInsertion(sub_scale_factor_0)
        ) * ExponentInsertion(sub_scale_factor_1, tag="sr_ey_ei") \
            * ExponentInsertion(sub_scale_factor_2)
        subnormal_result.set_attributes(debug=debug_lftolx,
                                        tag="subnormal_result")

        # yerr_last and pre_result are returned as well because the
        # generic scheme must sequence them explicitly
        return (result, subnormal_result, _current_approx,
                inv_iteration_list, yerr_last, pre_result)

    def bit_match(fp_optree, bit_id, likely=False, **kwords):
        """Test whether bit bit_id of fp_optree's 64-bit encoding is set."""
        return NotEqual(
            BitLogicAnd(TypeCast(fp_optree, precision=ML_Int64),
                        1 << bit_id),
            0, likely=likely, **kwords)

    def extract_and_inject_sign(sign_source, sign_dest,
                                int_precision=ML_Int64,
                                fp_precision=self.precision, **kwords):
        """Return sign_dest OR-ed with the sign bit of sign_source."""
        if isinstance(sign_dest.get_precision(), ML_Fixed_Format):
            int_sign_dest = sign_dest
        else:
            int_sign_dest = TypeCast(sign_dest, precision=int_precision)
        return TypeCast(
            BitLogicOr(
                BitLogicAnd(TypeCast(sign_source, precision=int_precision),
                            1 << (self.precision.bit_size - 1)),
                int_sign_dest),
            precision=fp_precision)

    x_zero = Test(vx, specifier=Test.IsZero, likely=False)
    y_zero = Test(vy, specifier=Test.IsZero, likely=False)

    comp_sign = Test(vx, vy, specifier=Test.CompSign,
                     tag="comp_sign", debug=debuglx)

    y_nan = Test(vy, specifier=Test.IsNaN, likely=False)

    x_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False)
    y_snan = Test(vy, specifier=Test.IsSignalingNaN, likely=False)

    x_inf = Test(vx, specifier=Test.IsInfty, likely=False, tag="x_inf")
    y_inf = Test(vy, specifier=Test.IsInfty, likely=False,
                 tag="y_inf", debug=debugd)

    scheme = None
    gappa_vx, gappa_vy = None, None
    gappa_init_approx = None
    gappa_current_approx = None

    if isinstance(processor, K1B_Processor):
        print("K1B specific generation")

        gappa_vx = vx
        gappa_vy = vy

        fast_init_approx = DivisionSeed(
            vx, vy, precision=self.precision,
            tag="fast_init_approx", debug=debug_lftolx)
        slow_init_approx = DivisionSeed(
            scaled_vx, scaled_vy, precision=self.precision,
            tag="slow_init_approx", debug=debug_lftolx)

        gappa_init_approx = fast_init_approx

        # the low bits of the seed encoding are tested as status flags
        specific_case = bit_match(
            fast_init_approx, 0, tag="b0_specific_case_bit", debug=debugd)
        y_subnormal_or_zero = bit_match(
            fast_init_approx, 1, tag="b1_y_sub_or_zero", debug=debugd)
        x_subnormal_or_zero = bit_match(
            fast_init_approx, 2, tag="b2_x_sub_or_zero", debug=debugd)
        y_inf_or_nan = bit_match(
            fast_init_approx, 3, tag="b3_y_inf_or_nan", debug=debugd)
        inv_underflow = bit_match(
            fast_init_approx, 4, tag="b4_inv_underflow", debug=debugd)
        x_inf_or_nan = bit_match(
            fast_init_approx, 5, tag="b5_x_inf_or_nan", debug=debugd)
        mult_error_underflow = bit_match(
            fast_init_approx, 6, tag="b6_mult_error_underflow",
            debug=debugd)
        mult_dividend_underflow = bit_match(
            fast_init_approx, 7, tag="b7_mult_dividend_underflow",
            debug=debugd)
        mult_dividend_overflow = bit_match(
            fast_init_approx, 8, tag="b8_mult_dividend_overflow",
            debug=debugd)
        direct_result_flag = bit_match(
            fast_init_approx, 9, tag="b9_direct_result_flag", debug=debugd)
        div_overflow = bit_match(
            fast_init_approx, 10, tag="b10_div_overflow", debug=debugd)

        # slow path works on scaled inputs and rescales by 2^(ex - ey);
        # fast path works on the raw inputs
        slow_result, slow_result_subnormal = compute_div(
            slow_init_approx, scaled_vx, scaled_vy,
            scale_result=ex - ey)[:2]
        fast_result, _, fast_current_approx, inv_iteration_list = \
            compute_div(fast_init_approx, vx, vy, scale_result=None)[:4]
        gappa_current_approx = fast_current_approx

        any_range_issue = (x_subnormal_or_zero | y_subnormal_or_zero
                           | inv_underflow | mult_error_underflow
                           | mult_dividend_overflow
                           | mult_dividend_underflow)

        slow_path = ConditionBlock(
            x_zero | y_zero,
            Return(fast_init_approx),
            ConditionBlock(
                Test(slow_result, specifier=Test.IsSubnormal),
                Return(slow_result_subnormal),
                Return(slow_result)))

        special_path = ConditionBlock(
            x_inf_or_nan,
            Return(fast_init_approx),
            ConditionBlock(
                y_inf_or_nan,
                Return(fast_init_approx),
                ConditionBlock(
                    NotEqual(div_overflow, 0, tag="div_overflow_case"),
                    Return(RoundedSignedOverflow(fast_init_approx,
                                                 tag="signed_inf")),
                    Return(FP_SNaN(self.precision)))))

        pre_scheme = ConditionBlock(
            NotEqual(specific_case, 0, tag="specific_case",
                     likely=True, debug=debugd),
            Return(fast_result),
            ConditionBlock(
                Equal(direct_result_flag, 0, tag="direct_result_case"),
                Return(fast_init_approx),
                ConditionBlock(any_range_issue, slow_path, special_path)))

        scheme = Statement(fast_result, pre_scheme)

    else:
        print("generic generation")

        x_inf_or_nan = Test(vx, specifier=Test.IsInfOrNaN, likely=False)
        y_inf_or_nan = Test(vy, specifier=Test.IsInfOrNaN, likely=False,
                            tag="y_inf_or_nan", debug=debugd)

        # compute_div expects an integer exponent as scale_result; the
        # inputs were scaled by 2^-ex and 2^-ey so the quotient must be
        # rescaled by 2^(ex - ey)
        (result, subnormal_result, gappa_current_approx,
         inv_iteration_list, yerr_last, pre_result) = compute_div(
             current_approx_std, scaled_vx, scaled_vy,
             scale_result=ex - ey)
        gappa_vx = scaled_vx
        gappa_vy = scaled_vy
        gappa_init_approx = init_approx

        # x is infinite or NaN
        x_special_case = ConditionBlock(
            x_inf,
            ConditionBlock(
                y_inf_or_nan,
                Statement(
                    ConditionBlock(y_snan, Raise(ML_FPE_Invalid)),
                    Return(FP_QNaN(self.precision)),
                ),
                ConditionBlock(
                    comp_sign,
                    Return(FP_MinusInfty(self.precision)),
                    Return(FP_PlusInfty(self.precision)))),
            Statement(
                ConditionBlock(x_snan, Raise(ML_FPE_Invalid)),
                Return(FP_QNaN(self.precision))))

        # x is zero: the quotient is a zero whose sign combines both
        # operand signs (comp_sign selects the negative result, as in the
        # other signed returns of this scheme)
        x_zero_case = ConditionBlock(
            y_zero | y_nan,
            Statement(
                ConditionBlock(y_snan, Raise(ML_FPE_Invalid)),
                Return(FP_QNaN(self.precision))),
            Return(Select(comp_sign,
                          FP_MinusZero(self.precision),
                          FP_PlusZero(self.precision))))

        # y is infinite or NaN (x finite and non-zero)
        y_special_case = ConditionBlock(
            y_inf,
            Return(Select(comp_sign,
                          FP_MinusZero(self.precision),
                          FP_PlusZero(self.precision))),
            Statement(
                ConditionBlock(y_snan, Raise(ML_FPE_Invalid)),
                Return(FP_QNaN(self.precision))))

        # division of a finite non-zero x by zero
        div_by_zero_case = Statement(
            Raise(ML_FPE_DivideByZero),
            ConditionBlock(
                comp_sign,
                Return(FP_MinusInfty(self.precision)),
                Return(FP_PlusInfty(self.precision))))

        # regular inputs: a non-zero last residual marks an inexact result
        subnormal_return = Statement(
            ConditionBlock(
                Comparison(yerr_last, 0, specifier=Comparison.NotEqual,
                           likely=True),
                Statement(Raise(ML_FPE_Inexact, ML_FPE_Underflow))),
            Return(subnormal_result),
        )
        normal_return = Statement(
            ConditionBlock(
                Comparison(yerr_last, 0, specifier=Comparison.NotEqual,
                           likely=True),
                Raise(ML_FPE_Inexact)),
            Return(result))

        pre_scheme = ConditionBlock(
            x_inf_or_nan,
            x_special_case,
            ConditionBlock(
                x_zero,
                x_zero_case,
                ConditionBlock(
                    y_inf_or_nan,
                    y_special_case,
                    ConditionBlock(
                        y_zero,
                        div_by_zero_case,
                        ConditionBlock(
                            Test(result, specifier=Test.IsSubnormal,
                                 likely=False),
                            subnormal_return,
                            normal_return)))))

        # the residual is evaluated in round-to-nearest, then the caller's
        # rounding mode is restored before the final quotient
        rnd_mode = GetRndMode()
        scheme = Statement(rnd_mode, SetRndMode(ML_RoundToNearest),
                           yerr_last, SetRndMode(rnd_mode), pre_result,
                           ClearException(), result, pre_scheme)

    opt_eng = OptimizationEngine(processor)

    # fusing FMA
    if fuse_fma:
        print("MDL fusing FMA")
        scheme = opt_eng.fuse_multiply_add(scheme, silence=True)

    print("MDL abstract scheme")
    opt_eng.instantiate_abstract_precision(scheme, None)

    print("MDL instantiated scheme")
    opt_eng.instantiate_precision(scheme, default_precision=self.precision)

    print("subexpression sharing")
    opt_eng.subexpression_sharing(scheme)

    # NOTE: silencing of FP operations and fast-path factorization are
    # not applied to this scheme.

    # registering scheme as function implementation
    exp_implementation.set_scheme(scheme)

    # check processor support
    print("checking processor support")
    opt_eng.check_processor_support(scheme)

    print("Gappa script generation")

    cg = CCodeGenerator(processor, declare_cst=False,
                        disable_debug=not debug_flag,
                        libm_compliant=libm_compliant)
    self.result = exp_implementation.get_definition(cg, C_Code,
                                                    static_cst=True)
    self.result.add_header("math.h")
    self.result.add_header("stdio.h")
    self.result.add_header("inttypes.h")
    self.result.add_header("support_lib/ml_special_values.h")

    # the context manager closes the file even if code emission fails
    with open(output_file, "w") as output_stream:
        output_stream.write(self.result.get(cg))

    # Gappa evaluation of the reciprocal approximation error
    seed_var = Variable("seed", precision=self.precision,
                        interval=Interval(0.5, 1))
    cg_eval_error_copy_map = {
        gappa_init_approx.get_handle().get_node(): seed_var,
        gappa_vx.get_handle().get_node():
            Variable("x", precision=self.precision,
                     interval=Interval(1, 2)),
        gappa_vy.get_handle().get_node():
            Variable("y", precision=self.precision,
                     interval=Interval(1, 2)),
    }
    G1 = Constant(1, precision=ML_Exact)
    exact = G1 / gappa_vy
    exact.set_precision(ML_Exact)
    exact.set_tag("div_exact")
    gappa_goal = gappa_current_approx.get_handle().get_node() - exact
    gappa_goal.set_precision(ML_Exact)
    gappacg = GappaCodeGenerator(target, declare_cst=False,
                                 disable_debug=True)
    gappa_code = gappacg.get_interval_code(gappa_goal,
                                           cg_eval_error_copy_map)

    new_exact_node = exact.get_handle().get_node()

    for nr in inv_iteration_list:
        nr.get_hint_rules(gappacg, gappa_code, new_exact_node)

    # hypothesis: the seed is within 2^-7 of the exact reciprocal
    seed_wrt_exact = seed_var - new_exact_node
    seed_wrt_exact.set_precision(ML_Exact)
    gappacg.add_hypothesis(gappa_code, seed_wrt_exact,
                           Interval(-S2**-7, S2**-7))

    # the Gappa run is best effort: a failure is reported but does not
    # abort generation (KeyboardInterrupt/SystemExit are not swallowed)
    try:
        eval_error = execute_gappa_script_extract(
            gappa_code.get(gappacg))["goal"]
        print("eval_error:  %s" % (eval_error,))
    except Exception:
        print("error during gappa run")
def __init__(self,
             precision = ML_Binary32,
             abs_accuracy = S2**-24,
             libm_compliant = True,
             debug_flag = False,
             fuse_fma = True,
             fast_path_extract = True,
             target = GenericProcessor(),
             output_file = "expf.c",
             function_name = "expf"):
    """Generate a C implementation of the exponential function.

    The scheme reduces the argument as r = x - k*log(2), with log(2)
    split into a high and a low part, approximates exp(r) by a polynomial
    evaluated with Horner's scheme and reconstructs the result as
    poly * 2^k, with dedicated handling of special values, overflow and
    underflow.  The polynomial degree is increased until the combined
    approximation and evaluation error is below one ulp.  The generated
    code is written to ``output_file``.

    Args:
        precision: floating-point format of the input and of the result;
            must be a key of the local sollya precision map
            (ML_Binary32 or ML_Binary64).
        abs_accuracy: accepted for interface compatibility; not read here.
        libm_compliant: forwarded to the C code generator.
        debug_flag: when true, debug display is kept in the generated code
            and the stdio/inttypes headers are added.
        fuse_fma: when true, multiply-add pairs are fused.
        fast_path_extract: when true, the fast path is factorized.
        target: processor description used for support checks and codegen.
        output_file: path of the generated C file.
        function_name: name of the generated C function.
    """
    # declaring target and instantiating optimization engine
    processor = target
    self.precision = precision
    opt_eng = OptimizationEngine(processor)
    gappacg = GappaCodeGenerator(processor, declare_cst = True,
                                 disable_debug = True)

    # declaring CodeFunction and retrieving input variable
    self.function_name = function_name
    exp_implementation = CodeFunction(self.function_name,
                                      output_format = self.precision)
    vx = exp_implementation.add_input_variable("x", self.precision)

    Log.report(Log.Info,
               "\033[33;1m generating implementation scheme \033[0m")

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    test_nan_or_inf = Test(vx, specifier = Test.IsInfOrNaN,
                           likely = False, debug = True,
                           tag = "nan_or_inf")
    test_nan = Test(vx, specifier = Test.IsNaN, debug = True,
                    tag = "is_nan_test")
    test_positive = Comparison(vx, 0,
                               specifier = Comparison.GreaterOrEqual,
                               debug = True, tag = "inf_sign")

    test_signaling_nan = Test(vx, specifier = Test.IsSignalingNaN,
                              debug = True, tag = "is_signaling_nan")
    return_snan = Statement(
        ExpRaiseReturn(ML_FPE_Invalid,
                       return_value = FP_QNaN(self.precision)))

    # return in case of infinity input
    infty_return = Statement(
        ConditionBlock(test_positive,
                       Return(FP_PlusInfty(self.precision)),
                       Return(FP_PlusZero(self.precision))))
    # return in case of specific value input (NaN or inf)
    specific_return = ConditionBlock(
        test_nan,
        ConditionBlock(test_signaling_nan, return_snan,
                       Return(FP_QNaN(self.precision))),
        infty_return)

    # return in case of standard (non-special) input
    # exclusion of early overflow and underflow cases
    precision_emax = self.precision.get_emax()
    precision_max_value = S2 * S2**precision_emax
    exp_overflow_bound = ceil(log(precision_max_value))
    early_overflow_test = Comparison(vx, exp_overflow_bound,
                                     likely = False,
                                     specifier = Comparison.Greater)
    early_overflow_return = Statement(
        ClearException(),
        ExpRaiseReturn(ML_FPE_Inexact, ML_FPE_Overflow,
                       return_value = FP_PlusInfty(self.precision)))

    precision_emin = self.precision.get_emin_subnormal()
    precision_min_value = S2 ** precision_emin
    exp_underflow_bound = floor(log(precision_min_value))

    early_underflow_test = Comparison(vx, exp_underflow_bound,
                                      likely = False,
                                      specifier = Comparison.Less)
    early_underflow_return = Statement(
        ClearException(),
        ExpRaiseReturn(ML_FPE_Inexact, ML_FPE_Underflow,
                       return_value = FP_PlusZero(self.precision)))

    sollya_prec_map = {ML_Binary32: sollya.binary32,
                       ML_Binary64: sollya.binary64}

    # constant computation
    invlog2 = round(1/log(2), sollya_prec_map[self.precision], RN)

    interval_vx = Interval(exp_underflow_bound, exp_overflow_bound)
    interval_fk = interval_vx * invlog2
    interval_k = Interval(floor(inf(interval_fk)),
                          ceil(sup(interval_fk)))

    # log2_hi is rounded to fewer bits than the mantissa field, the
    # reduction depending on the magnitude of k; the k * log2_hi product
    # is flagged exact below
    log2_hi_precision = self.precision.get_field_size() \
        - (ceil(log2(sup(abs(interval_k)))) + 2)
    Log.report(Log.Info, "log2_hi_precision: %s" % log2_hi_precision)
    log2_hi = round(log(2), log2_hi_precision, sollya.RN)
    log2_lo = round(log(2) - log2_hi,
                    sollya_prec_map[self.precision], sollya.RN)

    # argument reduction
    unround_k = vx * invlog2
    unround_k.set_attributes(tag = "unround_k",
                             debug = ML_Debug(display_format = "%f"))
    k = NearestInteger(unround_k, precision = self.precision,
                       debug = ML_Debug(display_format = "%f"))
    ik = NearestInteger(unround_k, precision = ML_Int32,
                        debug = ML_Debug(display_format = "%d"),
                        tag = "ik")
    ik.set_tag("ik")
    k.set_tag("k")
    exact_pre_mul = (k * log2_hi)
    exact_pre_mul.set_attributes(exact = True)
    exact_hi_part = vx - exact_pre_mul
    exact_hi_part.set_attributes(exact = True)
    r = exact_hi_part - k * log2_lo
    r.set_tag("r")
    r.set_attributes(debug = ML_Debug(display_format = "%f"))

    # error evaluation of the argument reduction on an optimized copy
    opt_r = opt_eng.optimization_process(r, self.precision, copy = True,
                                         fuse_fma = fuse_fma)

    tag_map = {}
    opt_eng.register_nodes_by_tag(opt_r, tag_map)

    cg_eval_error_copy_map = {
        vx: Variable("x", precision = self.precision,
                     interval = interval_vx),
        tag_map["k"]: Variable("k", interval = interval_k,
                               precision = self.precision)
    }

    eval_error = gappacg.get_eval_error_v2(opt_eng, opt_r,
                                           cg_eval_error_copy_map,
                                           gappa_filename = "red_arg.g")
    Log.report(Log.Info, "eval error: %s" % eval_error)

    print(r.get_str(depth = None, display_precision = True,
                    display_attribute = True))
    print(opt_r.get_str(depth = None, display_precision = True,
                        display_attribute = True))

    approx_interval = Interval(-log(2)/2, log(2)/2)

    local_ulp = sup(ulp(exp(approx_interval), self.precision))
    print("ulp:  %s" % (local_ulp,))
    error_goal = local_ulp
    error_goal_approx = S2**-1 * error_goal

    Log.report(Log.Info,
               "\033[33;1m building mathematical polynomial \033[0m\n")
    poly_degree = sup(guessdegree(exp(x), approx_interval,
                                  error_goal_approx))
    init_poly_degree = poly_degree

    # degree search: raise the degree until the total relative error of
    # the polynomial (approximation + evaluation) is below one ulp
    while True:
        Log.report(Log.Info, "attempting poly degree: %d" % poly_degree)
        poly_object, poly_approx_error = \
            Polynomial.build_from_approximation_with_error(
                exp(x), poly_degree,
                [self.precision]*(poly_degree+1),
                approx_interval, absolute)

        Log.report(Log.Info, "poly approx error: %s" % poly_approx_error)

        Log.report(
            Log.Info,
            "\033[33;1m generating polynomial evaluation scheme \033[0m")
        poly = PolynomialSchemeEvaluator.generate_horner_scheme(
            poly_object, r, unified_precision = self.precision)
        poly.set_tag("poly")

        # optimizing poly before evaluation error computation; the
        # returned node is not used, the call is kept for whatever effect
        # it has on poly (no copy is requested)
        opt_eng.optimization_process(poly, self.precision)

        # evaluating error of the polynomial approximation
        r_gappa_var = Variable("r", precision = self.precision,
                               interval = approx_interval)
        poly_error_copy_map = {
            r.get_handle().get_node(): r_gappa_var
        }
        gappacg = GappaCodeGenerator(target, declare_cst = False,
                                     disable_debug = True)
        poly_eval_error = gappacg.get_eval_error_v2(
            opt_eng, poly.get_handle().get_node(), poly_error_copy_map,
            gappa_filename = "gappa_poly.g")
        Log.report(Log.Info,
                   "poly evaluation error: %s" % poly_eval_error)

        global_poly_error = poly_eval_error + poly_approx_error
        global_rel_poly_error = global_poly_error / exp(approx_interval)
        print("global_poly_error:  %s %s"
              % (global_poly_error, global_rel_poly_error))
        flag = local_ulp > sup(abs(global_rel_poly_error))
        print("test:  %s" % (flag,))
        if flag:
            break
        if poly_degree > init_poly_degree + 5:
            Log.report(Log.Error, "poly degree search did not converge")
        poly_degree += 1

    # late overflow: the scaling by 2^ik is split in two factors
    late_overflow_test = Comparison(ik, self.precision.get_emax(),
                                    specifier = Comparison.Greater,
                                    likely = False, debug = True,
                                    tag = "late_overflow_test")
    overflow_exp_offset = (self.precision.get_emax()
                           - self.precision.get_field_size() / 2)
    diff_k = ik - overflow_exp_offset
    diff_k.set_attributes(debug = ML_Debug(display_format = "%d"),
                          tag = "diff_k")
    late_overflow_result = (ExponentInsertion(diff_k) * poly) \
        * ExponentInsertion(overflow_exp_offset)
    late_overflow_result.set_attributes(silent = False,
                                        tag = "late_overflow_result",
                                        debug = debugf)
    late_overflow_return = ConditionBlock(
        Test(late_overflow_result, specifier = Test.IsInfty,
             likely = False),
        ExpRaiseReturn(ML_FPE_Overflow,
                       return_value = FP_PlusInfty(self.precision)),
        Return(late_overflow_result))

    # late underflow: the result is first built with an offset exponent
    # and then scaled back
    late_underflow_test = Comparison(k, self.precision.get_emin_normal(),
                                     specifier = Comparison.LessOrEqual,
                                     likely = False)
    underflow_exp_offset = 2 * self.precision.get_field_size()
    late_underflow_result = \
        (ExponentInsertion(ik + underflow_exp_offset) * poly) \
        * ExponentInsertion(-underflow_exp_offset)
    late_underflow_result.set_attributes(
        debug = ML_Debug(display_format = "%e"),
        tag = "late_underflow_result", silent = False)
    test_subnormal = Test(late_underflow_result,
                          specifier = Test.IsSubnormal)
    late_underflow_return = Statement(
        ConditionBlock(
            test_subnormal,
            ExpRaiseReturn(ML_FPE_Underflow,
                           return_value = late_underflow_result)),
        Return(late_underflow_result))

    # standard reconstruction
    std_result = poly * ExponentInsertion(ik, tag = "exp_ik",
                                          debug = debug_lftolx)
    std_result.set_attributes(tag = "std_result", debug = debug_lftolx)
    result_scheme = ConditionBlock(
        late_overflow_test, late_overflow_return,
        ConditionBlock(late_underflow_test, late_underflow_return,
                       Return(std_result)))
    std_return = ConditionBlock(
        early_overflow_test, early_overflow_return,
        ConditionBlock(early_underflow_test, early_underflow_return,
                       result_scheme))

    # main scheme
    Log.report(Log.Info, "\033[33;1m MDL scheme \033[0m")
    scheme = ConditionBlock(test_nan_or_inf,
                            Statement(ClearException(), specific_return),
                            std_return)

    # fusing FMA
    if fuse_fma:
        Log.report(Log.Info, "\033[33;1m MDL fusing FMA \033[0m")
        scheme = opt_eng.fuse_multiply_add(scheme, silence = True)

    Log.report(Log.Info, "\033[33;1m MDL abstract scheme \033[0m")
    opt_eng.instantiate_abstract_precision(scheme, None)

    Log.report(Log.Info, "\033[33;1m MDL instantiated scheme \033[0m")
    opt_eng.instantiate_precision(scheme,
                                  default_precision = self.precision)

    Log.report(Log.Info, "\033[33;1m subexpression sharing \033[0m")
    opt_eng.subexpression_sharing(scheme)

    Log.report(Log.Info, "\033[33;1m silencing operation \033[0m")
    opt_eng.silence_fp_operations(scheme)

    # registering scheme as function implementation
    exp_implementation.set_scheme(scheme)

    # check processor support
    Log.report(Log.Info, "\033[33;1m checking processor support \033[0m")
    opt_eng.check_processor_support(scheme)

    # factorizing fast path
    if fast_path_extract:
        Log.report(Log.Info, "\033[33;1m factorizing fast path\033[0m")
        opt_eng.factorize_fast_path(scheme)

    Log.report(Log.Info, "\033[33;1m generating source code \033[0m")
    cg = CCodeGenerator(processor, declare_cst = False,
                        disable_debug = not debug_flag,
                        libm_compliant = libm_compliant)
    self.result = exp_implementation.get_definition(cg, C_Code,
                                                    static_cst = True)
    self.result.add_header("support_lib/ml_special_values.h")
    self.result.add_header_comment(
        "polynomial degree for exp(x): %d" % poly_degree)
    self.result.add_header_comment(
        "sollya polynomial for exp(x): %s"
        % poly_object.get_sollya_object())
    if debug_flag:
        self.result.add_header("stdio.h")
        self.result.add_header("inttypes.h")

    # the context manager closes the file even if code emission fails
    with open(output_file, "w") as output_stream:
        output_stream.write(self.result.get(cg))
def __init__(self,
             precision = ML_Binary32,
             abs_accuracy = S2**-24,
             libm_compliant = True,
             debug_flag = False,
             fuse_fma = True,
             num_iter = 3,
             fast_path_extract = True,
             target = GenericProcessor(),
             output_file = "__divsf3.c",
             function_name = "__divsf3"):
    """Build the division scheme, write its C source, and print a Gappa bound.

    The constructor does all the work: it builds the operation graph of a
    two-input function (``x`` and ``y``), runs the optimization passes,
    writes the generated C code to ``output_file`` and finally prints the
    interval Gappa returns for the error of the reciprocal approximation.

    Args:
        precision: floating-point format of inputs and output. It must be a
            key of the extended-precision map below (ML_Binary32 or
            ML_Binary64).
        abs_accuracy: accepted for interface compatibility; not read here.
        libm_compliant: forwarded to CCodeGenerator.
        debug_flag: when False, debug display is disabled in the C generator.
        fuse_fma: when True, run the multiply-add fusion pass on the scheme.
        num_iter: number of Newton-Raphson iterations applied to the seed.
        fast_path_extract: accepted for interface compatibility; the
            fast-path factorization pass is disabled in this generator.
        target: processor description used for optimization and generation.
            NOTE: the default instance is created once, when the function is
            defined, and is therefore shared by every call relying on it.
        output_file: path of the C file to write.
        function_name: name of the generated C function.

    Raises:
        KeyError: if ``precision`` has no entry in the extended-precision map.
    """
    # declaring CodeFunction and retrieving input variable
    self.precision = precision
    self.function_name = function_name
    exp_implementation = CodeFunction(self.function_name, output_format = precision)
    vx = exp_implementation.add_input_variable("x", precision)
    vy = exp_implementation.add_input_variable("y", precision)

    class NR_Iteration(object):
        """One Newton-Raphson refinement of an approximation of 1 / divisor."""

        def __init__(self, approx, divisor, force_fma = False):
            """Build error = 1 - divisor * approx and approx + error * approx.

            With ``force_fma`` both steps are built as explicit
            FusedMultiplyAdd nodes instead of separate multiply/add nodes.
            """
            self.approx = approx
            self.divisor = divisor
            self.force_fma = force_fma
            if force_fma:
                self.error = FusedMultiplyAdd(divisor, approx, 1.0, specifier = FusedMultiplyAdd.SubtractNegate)
                self.new_approx = FusedMultiplyAdd(self.error, self.approx, self.approx, specifier = FusedMultiplyAdd.Standard)
            else:
                self.error = 1 - divisor * approx
                self.new_approx = self.approx + self.error * self.approx

        def get_new_approx(self):
            """Return the refined approximation node."""
            return self.new_approx

        def get_hint_rules(self, gcg, gappa_code, exact):
            """Register three rewriting hints for this iteration in gappa_code.

            ``exact`` is the node standing for the exact reciprocal. The
            hint expressions are built with ML_Exact as default precision,
            on the nodes reached through each operand's handle.
            """
            divisor = self.divisor.get_handle().get_node()
            approx = self.approx.get_handle().get_node()
            new_approx = self.new_approx.get_handle().get_node()

            Attributes.set_default_precision(ML_Exact)

            if self.force_fma:
                rule0 = FusedMultiplyAdd(divisor, approx, 1.0, specifier = FusedMultiplyAdd.SubtractNegate)
            else:
                rule0 = 1.0 - divisor * approx
            rule1 = 1.0 - divisor * (approx - exact) - 1.0

            rule2 = new_approx - exact
            subrule = approx * (2 - divisor * approx)
            rule3 = (new_approx - subrule) - (approx - exact) * (approx - exact) * divisor

            if self.force_fma:
                new_error = FusedMultiplyAdd(divisor, approx, 1.0, specifier = FusedMultiplyAdd.SubtractNegate)
                rule4 = FusedMultiplyAdd(new_error, approx, approx)
            else:
                rule4 = approx + (1 - divisor * approx) * approx

            Attributes.unset_default_precision()

            # registering hints
            gcg.add_hint(gappa_code, rule0, rule1)
            gcg.add_hint(gappa_code, rule2, rule3)
            gcg.add_hint(gappa_code, subrule, rule4)

    # debug display descriptors attached to intermediate nodes
    debugf = ML_Debug(display_format = "%f")
    debuglf = ML_Debug(display_format = "%lf")
    debugx = ML_Debug(display_format = "%x")
    debuglx = ML_Debug(display_format = "%lx")
    debugd = ML_Debug(display_format = "%d")
    debug_lftolx = ML_Debug(display_format = "%\"PRIx64\"", pre_process = lambda v: "double_to_64b_encoding(%s)" % v)
    debug_ddtolx = ML_Debug(display_format = "%\"PRIx64\" %\"PRIx64\"", pre_process = lambda v: "double_to_64b_encoding(%s.hi), double_to_64b_encoding(%s.lo)" % (v, v))
    debug_dd = ML_Debug(display_format = "{.hi=%lf, .lo=%lf}", pre_process = lambda v: "%s.hi, %s.lo" % (v, v))

    # input exponents, clamped to at most 1020
    ex = Min(ExponentExtraction(vx, tag = "ex", debug = debugd), 1020)
    ey = Min(ExponentExtraction(vy, tag = "ey", debug = debugd), 1020)

    # scaling both inputs by 2^-exponent; alternative kept for reference:
    # ConditionalAllocation(Abs(ex) > 100, -ex, 0)
    scaling_factor_x = ExponentInsertion(-ex)
    scaling_factor_y = ExponentInsertion(-ey)

    scaled_vx = vx * scaling_factor_x
    scaled_vy = vy * scaling_factor_y

    scaled_vx.set_attributes(debug = debug_lftolx, tag = "scaled_vx")
    scaled_vy.set_attributes(debug = debug_lftolx, tag = "scaled_vy")

    # The seed is always computed in ML_Binary32.
    # NOTE(review): when precision is ML_Binary32 the seed operands are the
    # unscaled vx / vy, whereas the iterations below refine against
    # scaled_vy -- confirm this is intended.
    px = Conversion(scaled_vx, precision = ML_Binary32, tag = "px", debug=debugf) if self.precision != ML_Binary32 else vx
    py = Conversion(scaled_vy, precision = ML_Binary32, tag = "py", debug=debugf) if self.precision != ML_Binary32 else vy

    pre_init_approx = DivisionSeed(px, py, precision = ML_Binary32, tag = "seed", debug = debugf)
    init_approx = Conversion(pre_init_approx, precision = self.precision, tag = "seedd", debug = debug_lftolx) if self.precision != ML_Binary32 else pre_init_approx

    current_approx = init_approx
    # correctly-rounded inverse computation
    num_iteration = num_iter
    inv_iteration_list = []
    # every node created until the matching unset_* calls below gets
    # round-to-nearest and the silent attribute by default
    Attributes.set_default_rounding_mode(ML_RoundToNearest)
    Attributes.set_default_silent(True)
    for i in range(num_iteration):
        # only the last iteration is built with explicit FMA nodes
        new_iteration = NR_Iteration(current_approx, scaled_vy, force_fma = (i == num_iteration - 1))
        inv_iteration_list.append(new_iteration)
        current_approx = new_iteration.get_new_approx()
        current_approx.set_attributes(tag = "iter_%d" % i, debug = debug_lftolx)

    def dividend_mult(div_approx, inv_approx, dividend, divisor, index, force_fma = False):
        """Return div_approx corrected by its residual times inv_approx.

        ``force_fma`` is accepted but not used; the FMSN / FMA formulations
        are kept below as comments.
        """
        yerr = dividend - div_approx * divisor
        #yerr = FMSN(div_approx, divisor, dividend)
        yerr.set_attributes(tag = "yerr%d" % index, debug = debug_lftolx)
        new_div = div_approx + yerr * inv_approx
        #new_div = FMA(yerr, inv_approx, div_approx)
        new_div.set_attributes(tag = "new_div%d" % index, debug = debug_lftolx)
        return new_div

    # multiplication correction iteration
    # to get correctly rounded full division
    current_approx.set_attributes(tag = "final_approx", debug = debug_lftolx)
    current_div_approx = scaled_vx * current_approx
    num_dividend_mult_iteration = 1
    for i in range(num_dividend_mult_iteration):
        current_div_approx = dividend_mult(current_div_approx, current_approx, scaled_vx, scaled_vy, i)

    # last iteration
    yerr_last = FMSN(current_div_approx, scaled_vy, scaled_vx) #, clearprevious = True)
    # the final FMA is created after the defaults are restored
    Attributes.unset_default_rounding_mode()
    Attributes.unset_default_silent()
    last_div_approx = FMA(yerr_last, current_approx, current_div_approx)

    yerr_last.set_attributes(tag = "yerr_last", debug = debug_lftolx)

    pre_result = last_div_approx
    pre_result.set_attributes(tag = "unscaled_div_result", debug = debug_lftolx)
    # undo the input scaling in two steps: 2^ex, then 2^-ey
    result = pre_result * ExponentInsertion(ex) * ExponentInsertion(-ey)
    result.set_attributes(tag = "result", debug = debug_lftolx)

    # special-case predicates on the raw inputs
    x_inf_or_nan = Test(vx, specifier = Test.IsInfOrNaN, likely = False)
    y_inf_or_nan = Test(vy, specifier = Test.IsInfOrNaN, likely = False, tag = "y_inf_or_nan", debug = debugd)
    comp_sign = Test(vx, vy, specifier = Test.CompSign, tag = "comp_sign", debug = debuglx )
    x_zero = Test(vx, specifier = Test.IsZero, likely = False)
    y_zero = Test(vy, specifier = Test.IsZero, likely = False)

    y_nan = Test(vy, specifier = Test.IsNaN, likely = False)

    x_snan = Test(vx, specifier = Test.IsSignalingNaN, likely = False)
    y_snan = Test(vy, specifier = Test.IsSignalingNaN, likely = False)

    x_inf = Test(vx, specifier = Test.IsInfty, likely = False, tag = "x_inf")
    y_inf = Test(vy, specifier = Test.IsInfty, likely = False, tag = "y_inf", debug = debugd)

    # determining an extended precision
    ext_precision_map = {
        ML_Binary32: ML_Binary64,
        ML_Binary64: ML_DoubleDouble,
    }
    ext_precision = ext_precision_map[self.precision]

    # same final FMA as last_div_approx, evaluated in the extended format;
    # it feeds the result returned on the subnormal path
    ext_pre_result = FMA(yerr_last, current_approx, current_div_approx, precision = ext_precision, tag = "ext_pre_result", debug = debug_ddtolx)
    if isinstance(ext_precision, ML_Compound_FP_Format):
        subnormal_pre_result = SpecificOperation(ext_pre_result, ex - ey, precision = self.precision, specifier = SpecificOperation.Subnormalize, tag = "subnormal_pre_result", debug = debug_lftolx)
        subnormal_result = (subnormal_pre_result * ExponentInsertion(ex)) * ExponentInsertion(-ey)
    else:
        subnormal_result = Conversion(ext_pre_result * ExponentInsertion(ex - ey, tag = "final_scaling_factor", precision = ext_precision), precision = self.precision)

    # Special-case dispatch. Branch order: x inf/NaN, x zero, y inf/NaN,
    # y zero, subnormal result, standard result. Signaling NaNs raise
    # ML_FPE_Invalid before a quiet NaN is returned.
    pre_scheme = ConditionBlock(x_inf_or_nan,
        ConditionBlock(x_inf,
            ConditionBlock(y_inf_or_nan,
                Statement(
                    ConditionBlock(y_snan, Raise(ML_FPE_Invalid)),
                    Return(FP_QNaN(self.precision)),
                ),
                ConditionBlock(comp_sign, Return(FP_MinusInfty(self.precision)), Return(FP_PlusInfty(self.precision)))
            ),
            Statement(
                ConditionBlock(x_snan, Raise(ML_FPE_Invalid)),
                Return(FP_QNaN(self.precision))
            )
        ),
        ConditionBlock(x_zero,
            ConditionBlock(y_zero | y_nan,
                Statement(
                    ConditionBlock(y_snan, Raise(ML_FPE_Invalid)),
                    Return(FP_QNaN(self.precision))
                ),
                Return(vx)
            ),
            ConditionBlock(y_inf_or_nan,
                ConditionBlock(y_inf,
                    Return(Select(comp_sign, FP_MinusZero(self.precision), FP_PlusZero(self.precision))),
                    Statement(
                        ConditionBlock(y_snan, Raise(ML_FPE_Invalid)),
                        Return(FP_QNaN(self.precision))
                    )
                ),
                ConditionBlock(y_zero,
                    Statement(
                        Raise(ML_FPE_DivideByZero),
                        ConditionBlock(comp_sign,
                            Return(FP_MinusInfty(self.precision)),
                            Return(FP_PlusInfty(self.precision))
                        )
                    ),
                    ConditionBlock(Test(result, specifier = Test.IsSubnormal, likely = False),
                        Statement(
                            ConditionBlock(Comparison(yerr_last, 0, specifier = Comparison.NotEqual, likely = True),
                                Statement(Raise(ML_FPE_Inexact, ML_FPE_Underflow))
                            ),
                            Return(subnormal_result),
                        ),
                        Statement(
                            ConditionBlock(Comparison(yerr_last, 0, specifier = Comparison.NotEqual, likely = True),
                                Raise(ML_FPE_Inexact)
                            ),
                            Return(result)
                        )
                    )
                )
            )
        )
    )
    # statement order: read the rounding mode, switch to round-to-nearest
    # around yerr_last, restore it, then pre_result, ClearException and result
    # ahead of the special-case dispatch
    rnd_mode = GetRndMode()
    scheme = Statement(rnd_mode, SetRndMode(ML_RoundToNearest), yerr_last, SetRndMode(rnd_mode), pre_result, ClearException(), result, pre_scheme)

    processor = target

    opt_eng = OptimizationEngine(processor)

    # fusing FMA
    # (progress messages use the single-argument parenthesised print form,
    # which prints the same text under Python 2 and parses under Python 3)
    if fuse_fma:
        print("MDL fusing FMA")
        scheme = opt_eng.fuse_multiply_add(scheme, silence = True)

    print("MDL abstract scheme")
    opt_eng.instantiate_abstract_precision(scheme, None)

    print("MDL instantiated scheme")
    opt_eng.instantiate_precision(scheme, default_precision = self.precision)

    print("subexpression sharing")
    opt_eng.subexpression_sharing(scheme)

    # NOTE: the silencing pass is disabled for this generator
    #print "silencing operation"
    #opt_eng.silence_fp_operations(scheme)

    # registering scheme as function implementation
    exp_implementation.set_scheme(scheme)

    #print scheme.get_str(depth = None, display_precision = True)

    # check processor support
    opt_eng.check_processor_support(scheme)

    # factorizing fast path
    # NOTE: disabled, which is why fast_path_extract has no effect
    #opt_eng.factorize_fast_path(scheme)

    cg = CCodeGenerator(processor, declare_cst = False, disable_debug = not debug_flag, libm_compliant = libm_compliant)
    self.result = exp_implementation.get_definition(cg, C_Code, static_cst = True)
    self.result.add_header("math.h")
    self.result.add_header("stdio.h")
    self.result.add_header("inttypes.h")
    self.result.add_header("support_lib/ml_special_values.h")

    # the context manager closes the file even if generation or the write
    # raises
    with open(output_file, "w") as output_stream:
        output_stream.write(self.result.get(cg))

    # Gappa model: bound (final reciprocal approximation - 1 / scaled_vy),
    # with the seed and both scaled inputs replaced by interval-constrained
    # variables
    seed_var = Variable("seed", precision = self.precision, interval = Interval(0.5, 1))
    cg_eval_error_copy_map = {
        init_approx.get_handle().get_node(): seed_var,
        scaled_vx.get_handle().get_node(): Variable("x", precision = self.precision, interval = Interval(1, 2)),
        scaled_vy.get_handle().get_node(): Variable("y", precision = self.precision, interval = Interval(1, 2)),
    }
    G1 = Constant(1, precision = ML_Exact)
    exact = G1 / scaled_vy
    exact.set_precision(ML_Exact)
    exact.set_tag("div_exact")
    gappa_goal = current_approx.get_handle().get_node() - exact
    gappa_goal.set_precision(ML_Exact)
    gappacg = GappaCodeGenerator(target, declare_cst = False, disable_debug = True)
    gappa_code = gappacg.get_interval_code(gappa_goal, cg_eval_error_copy_map)

    new_exact_node = exact.get_handle().get_node()

    for nr in inv_iteration_list:
        nr.get_hint_rules(gappacg, gappa_code, new_exact_node)

    # hypothesis: the seed is within 2^-7 of the exact reciprocal
    seed_wrt_exact = seed_var - new_exact_node
    seed_wrt_exact.set_precision(ML_Exact)
    gappacg.add_hypothesis(gappa_code, seed_wrt_exact, Interval(-S2**-7, S2**-7))

    eval_error = execute_gappa_script_extract(gappa_code.get(gappacg))["goal"]

    # two spaces on purpose: the Python 2 statement `print "eval_error: ", x`
    # emitted the literal's trailing space plus a separator space
    print("eval_error:  %s" % (eval_error,))
# -*- coding: utf-8 -*-

from metalibm_core.core.attributes import ML_Debug, ML_AdvancedDebug, ML_MultiDebug
from metalibm_core.core.ml_formats import *

# Shared debug display descriptors.

# floating-point values, printed with %f / %lf
debugf = ML_Debug(display_format="%f")
debuglf = ML_Debug(display_format="%lf")

# integer, printed in hexadecimal
debugx = ML_Debug(display_format="%x")

# 64-bit integer, printed in hexadecimal through the PRIx64 macro
debuglx = ML_Debug(display_format="%\"PRIx64\"")

# integer, cast to int in the generated expression and printed with %d
debugd = ML_Debug(
    display_format="%d",
    pre_process=lambda expr: "(int) %s" % expr,
)

# long and long long integers
debugld = ML_Debug(display_format="%ld")
debuglld = ML_Debug(display_format="%lld")


def fixed_point_pre_process(value, optree):
    """Return the printf argument text used to display a fixed-point value.

    The text holds two comma-separated expressions: ``value`` cast to double
    and multiplied by 2^-frac_size (frac_size being read from the precision
    of ``optree``), followed by ``value`` itself.
    """
    frac_bits = optree.get_precision().get_frac_size()
    return "(%e * (double)%s), %s" % (S2**-frac_bits, value, value)


# fixed-point value: scaled value followed by the raw integer in parentheses
debug_fixed32 = ML_AdvancedDebug(
    display_format="%e(%d)",
    pre_process=fixed_point_pre_process,
)