def __init__(self, precision=ML_Binary32, abs_accuracy=S2**-24, libm_compliant=True, debug_flag=False, fuse_fma=True, fast_path_extract=True, target=GenericProcessor(), output_file="log1pf.c", function_name="log1pf"):
    # declaring CodeFunction and retrieving input variable
    self.function_name = function_name
    self.precision = precision
    self.processor = target
    func_implementation = CodeFunction(self.function_name, output_format=self.precision)
    vx = func_implementation.add_input_variable("x", self.precision)

    sollya_precision = self.precision.sollya_object

    # debug utilities
    debugf = ML_Debug(display_format="%f")
    debuglf = ML_Debug(display_format="%lf")
    debugx = ML_Debug(display_format="%x")
    debuglx = ML_Debug(display_format="%\"PRIx64\"", )
    debugd = ML_Debug(display_format="%d", pre_process=lambda v: "(int) %s" % v)
    debugld = ML_Debug(display_format="%ld")
    #debug_lftolx = ML_Debug(display_format = "%\"PRIx64\"", pre_process = lambda v: "double_to_64b_encoding(%s)" % v)
    debug_lftolx = ML_Debug(
        display_format="%\"PRIx64\" ev=%x",
        pre_process=lambda v: "double_to_64b_encoding(%s), __k1_fpu_get_exceptions()" % v)
    debug_ddtolx = ML_Debug(
        display_format="%\"PRIx64\" %\"PRIx64\"",
        pre_process=lambda v: "double_to_64b_encoding(%s.hi), double_to_64b_encoding(%s.lo)" % (v, v))
    debug_dd = ML_Debug(display_format="{.hi=%lf, .lo=%lf}",
                        pre_process=lambda v: "%s.hi, %s.lo" % (v, v))

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    log2_hi_value = round(
        log(2),
        self.precision.get_field_size() - (self.precision.get_exponent_size() + 1),
        sollya.RN)
    log2_lo_value = round(
        log(2) - log2_hi_value, self.precision.sollya_object, sollya.RN)

    log2_hi = Constant(log2_hi_value, precision=self.precision)
    log2_lo = Constant(log2_lo_value, precision=self.precision)

    vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debugd)

    int_precision = ML_Int64 if self.precision is ML_Binary64 else ML_Int32

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision=self.precision)
    dummy_div_seed = DivisionSeed(dummy_var, precision=self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_div_seed, language=None,
        table_getter=lambda self: self.approx_table_map)

    # table creation
    table_index_size = 7
    log_table = ML_Table(dimensions=[2**table_index_size, 2], storage_precision=self.precision)
    log_table[0][0] = 0.0
    log_table[0][1] = 0.0
    for i in range(1, 2**table_index_size):
        #inv_value = (1.0 + (self.processor.inv_approx_table[i] / S2**9) + S2**-52) * S2**-1
        inv_value = (1.0 + (inv_approx_table[i][0] / S2**9)) * S2**-1
        value_high = round(
            log(inv_value),
            self.precision.get_field_size() - (self.precision.get_exponent_size() + 1),
            sollya.RN)
        value_low = round(log(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low

    vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debugd)

    # case close to 0: ctz
    ctz_exp_limit = -7
    ctz_cond = vx_exp < ctz_exp_limit
    ctz_interval = Interval(-S2**ctz_exp_limit, S2**ctz_exp_limit)

    ctz_poly_degree = sup(
        guessdegree(
            log1p(sollya.x) / sollya.x, ctz_interval,
            S2**-(self.precision.get_field_size() + 1))) + 1
    ctz_poly_object = Polynomial.build_from_approximation(
        log1p(sollya.x) / sollya.x, ctz_poly_degree,
        [self.precision] * (ctz_poly_degree + 1), ctz_interval, sollya.absolute)

    print("generating polynomial evaluation scheme")
    ctz_poly = PolynomialSchemeEvaluator.generate_horner_scheme(
        ctz_poly_object, vx, unified_precision=self.precision)
    ctz_poly.set_attributes(tag="ctz_poly", debug=debug_lftolx)

    ctz_result = vx * ctz_poly

    neg_input = Comparison(vx, -1, likely=False, specifier=Comparison.Less, debug=debugd, tag="neg_input")
    vx_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False, debug=debugd, tag="nan_or_inf")
    vx_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False, debug=debugd, tag="snan")
    vx_inf = Test(vx, specifier=Test.IsInfty, likely=False, debug=debugd, tag="inf")
    vx_subnormal = Test(vx, specifier=Test.IsSubnormal, likely=False, debug=debugd, tag="vx_subnormal")

    log_function_code = CodeFunction(
        "new_log", [Variable("x", precision=ML_Binary64)], output_format=ML_Binary64)
    log_call_generator = FunctionOperator(
        log_function_code.get_name(), arity=1, output_precision=ML_Binary64,
        declare_prototype=log_function_code)
    newlog_function = FunctionObject(log_function_code.get_name(), (ML_Binary64, ), ML_Binary64, log_call_generator)

    # case away from 0.0
    pre_vxp1 = vx + 1.0
    pre_vxp1.set_attributes(tag="pre_vxp1", debug=debug_lftolx)
    pre_vxp1_exp = ExponentExtraction(pre_vxp1, tag="pre_vxp1_exp", debug=debugd)

    cm500 = Constant(-500, precision=ML_Int32)
    c0 = Constant(0, precision=ML_Int32)
    cond_scaling = pre_vxp1_exp > 2**(self.precision.get_exponent_size() - 2)
    scaling_factor_exp = Select(cond_scaling, cm500, c0)
    scaling_factor = ExponentInsertion(scaling_factor_exp, precision=self.precision, tag="scaling_factor")

    vxp1 = pre_vxp1 * scaling_factor
    vxp1.set_attributes(tag="vxp1", debug=debug_lftolx)
    vxp1_exp = ExponentExtraction(vxp1, tag="vxp1_exp", debug=debugd)

    vxp1_inv = DivisionSeed(vxp1, precision=self.precision, tag="vxp1_inv", debug=debug_lftolx, silent=True)
    vxp1_dirty_inv = ExponentInsertion(-vxp1_exp, precision=self.precision, tag="vxp1_dirty_inv", debug=debug_lftolx)

    table_index = BitLogicAnd(
        BitLogicRightShift(
            TypeCast(vxp1, precision=int_precision, debug=debuglx),
            self.precision.get_field_size() - 7, debug=debuglx),
        0x7f, tag="table_index", debug=debuglx)

    # argument reduction
    # TODO: detect if single operand inverse seed is supported by the targeted architecture
    pre_arg_red_index = TypeCast(
        BitLogicAnd(
            TypeCast(vxp1_inv, precision=ML_UInt64),
            Constant(-2, precision=ML_UInt64),
            precision=ML_UInt64),
        precision=self.precision, tag="pre_arg_red_index", debug=debug_lftolx)
    arg_red_index = Select(Equal(table_index, 0), vxp1_dirty_inv, pre_arg_red_index,
                           tag="arg_red_index", debug=debug_lftolx)

    red_vxp1 = Select(cond_scaling, arg_red_index * vxp1 - 1.0, (arg_red_index * vx - 1.0) + arg_red_index)
    #red_vxp1 = arg_red_index * vxp1 - 1.0
    red_vxp1.set_attributes(tag="red_vxp1", debug=debug_lftolx)

    log_inv_lo = TableLoad(log_table, table_index, 1, tag="log_inv_lo", debug=debug_lftolx)
    log_inv_hi = TableLoad(log_table, table_index, 0, tag="log_inv_hi", debug=debug_lftolx)

    inv_err = S2**-6  # TODO: link to target DivisionSeed precision

    print("building mathematical polynomial")
    approx_interval = Interval(-inv_err, inv_err)
    poly_degree = sup(
        guessdegree(
            log(1 + sollya.x) / sollya.x, approx_interval,
            S2**-(self.precision.get_field_size() + 1))) + 1
    global_poly_object = Polynomial.build_from_approximation(
        log(1 + sollya.x) / sollya.x, poly_degree,
        [self.precision] * (poly_degree + 1), approx_interval, sollya.absolute)
    poly_object = global_poly_object.sub_poly(start_index=1)

    print("generating polynomial evaluation scheme")
    _poly = PolynomialSchemeEvaluator.generate_horner_scheme(
        poly_object, red_vxp1, unified_precision=self.precision)
    _poly.set_attributes(tag="poly", debug=debug_lftolx)
    print(global_poly_object.get_sollya_object())

    vxp1_inv_exp = ExponentExtraction(vxp1_inv, tag="vxp1_inv_exp", debug=debugd)
    corr_exp = -vxp1_exp + scaling_factor_exp  # vxp1_inv_exp

    #poly = (red_vxp1) * (1 + _poly)
    #poly.set_attributes(tag = "poly", debug = debug_lftolx, prevent_optimization = True)

    pre_result = -log_inv_hi + (red_vxp1 + red_vxp1 * _poly + (-corr_exp * log2_lo - log_inv_lo))
    pre_result.set_attributes(tag="pre_result", debug=debug_lftolx)

    exact_log2_hi_exp = -corr_exp * log2_hi
    exact_log2_hi_exp.set_attributes(tag="exact_log2_hi_exp", debug=debug_lftolx, prevent_optimization=True)
    #std_result = exact_log2_hi_exp + pre_result

    exact_log2_lo_exp = -corr_exp * log2_lo
    exact_log2_lo_exp.set_attributes(tag="exact_log2_lo_exp", debug=debug_lftolx)  #, prevent_optimization = True)

    init = exact_log2_lo_exp - log_inv_lo
    init.set_attributes(tag="init", debug=debug_lftolx, prevent_optimization=True)
    fma0 = (red_vxp1 * _poly + init)  # - log_inv_lo)
    fma0.set_attributes(tag="fma0", debug=debug_lftolx)
    step0 = fma0
    step0.set_attributes(tag="step0", debug=debug_lftolx)  #, prevent_optimization = True)

    step1 = step0 + red_vxp1
    step1.set_attributes(tag="step1", debug=debug_lftolx, prevent_optimization=True)

    step2 = -log_inv_hi + step1
    step2.set_attributes(tag="step2", debug=debug_lftolx, prevent_optimization=True)

    std_result = exact_log2_hi_exp + step2
    std_result.set_attributes(tag="std_result", debug=debug_lftolx, prevent_optimization=True)

    # main scheme
    print("MDL scheme")
    pre_scheme = ConditionBlock(
        neg_input,
        Statement(
            ClearException(),
            Raise(ML_FPE_Invalid),
            Return(FP_QNaN(self.precision))),
        ConditionBlock(
            vx_nan_or_inf,
            ConditionBlock(
                vx_inf,
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                Statement(
                    ClearException(),
                    ConditionBlock(vx_snan, Raise(ML_FPE_Invalid)),
                    Return(FP_QNaN(self.precision)))),
            ConditionBlock(
                vx_subnormal,
                Return(vx),
                ConditionBlock(
                    ctz_cond,
                    Statement(Return(ctz_result), ),
                    Statement(Return(std_result))))))
    scheme = pre_scheme
    #print scheme.get_str(depth = None, display_precision = True)

    opt_eng = OptimizationEngine(self.processor)

    # fusing FMA
    print("MDL fusing FMA")
    scheme = opt_eng.fuse_multiply_add(scheme, silence=True)

    print("MDL abstract scheme")
    opt_eng.instantiate_abstract_precision(scheme, None)
    #print scheme.get_str(depth = None, display_precision = True)

    print("MDL instantiated scheme")
    opt_eng.instantiate_precision(scheme, default_precision=ML_Binary32)

    print("subexpression sharing")
    opt_eng.subexpression_sharing(scheme)

    print("silencing operation")
    opt_eng.silence_fp_operations(scheme)

    # registering scheme as function implementation
    func_implementation.set_scheme(scheme)

    # check processor support
    opt_eng.check_processor_support(scheme)

    # factorizing fast path
    opt_eng.factorize_fast_path(scheme)
    #print scheme.get_str(depth = None, display_precision = True)

    cg = CCodeGenerator(self.processor, declare_cst=False, disable_debug=not debug_flag, libm_compliant=libm_compliant)
    self.result = func_implementation.get_definition(cg, C_Code, static_cst=True)
    self.result.add_header("support_lib/ml_special_values.h")
    self.result.add_header("math.h")
    self.result.add_header("stdio.h")
    self.result.add_header("inttypes.h")
    #print self.result.get(cg)
    output_stream = open("%s.c" % func_implementation.get_name(), "w")
    output_stream.write(self.result.get(cg))
    output_stream.close()
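
# --- Illustrative, standalone sketch (not part of the generator above). ---
# The constructor splits log(2) into log2_hi + log2_lo, with log2_hi rounded to
# field_size - (exponent_size + 1) mantissa bits so that exponent * log2_hi is
# exact. The plain-float sketch below mimics what the sollya round() calls do,
# assuming binary64 parameters (field_size = 52, exponent_size = 11); the names
# and the helper itself are illustrative only.
import math
from fractions import Fraction

def split_log2_sketch(kept_bits=52 - (11 + 1)):
    """Return (log2_hi, log2_lo) with log2_hi carrying ~kept_bits significant bits."""
    exact = math.log(2.0)
    scale = 2.0 ** (kept_bits - math.frexp(exact)[1])
    log2_hi = round(exact * scale) / scale   # keep only the top kept_bits bits
    log2_lo = exact - log2_hi                # exact by Sterbenz (hi is within a factor 2 of exact)
    return log2_hi, log2_lo

if __name__ == "__main__":
    hi, lo = split_log2_sketch()
    e = 1023  # worst-case binary64 exponent magnitude
    # e has ~11 bits and hi ~40 bits, so the product fits in 53 bits: no rounding error.
    assert Fraction(e * hi) == e * Fraction(hi)
    print(hi, lo, (hi + lo) - math.log(2.0))
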
def generate_scheme(self):
    vx = self.implementation.add_input_variable("x", self.precision)
    sollya_precision = self.get_input_precision().sollya_object

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    # 2-limb approximation of log(2)
    # hi part precision is reduced to provide exact operation
    # when multiplied by an exponent value
    log2_hi_value = round(log(2), self.precision.get_field_size() - (self.precision.get_exponent_size() + 1), sollya.RN)
    log2_lo_value = round(log(2) - log2_hi_value, self.precision.sollya_object, sollya.RN)
    log2_hi = Constant(log2_hi_value, precision=self.precision)
    log2_lo = Constant(log2_lo_value, precision=self.precision)

    int_precision = self.precision.get_integer_format()

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision = self.precision)
    dummy_rcp_seed = ReciprocalSeed(dummy_var, precision = self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_rcp_seed, language = None,
        table_getter = lambda self: self.approx_table_map)

    # table creation
    table_index_size = inv_approx_table.index_size
    log_table = ML_NewTable(dimensions = [2**table_index_size, 2], storage_precision = self.precision)

    # storing accurate logarithm approximation of value returned
    # by the fast reciprocal operation
    for i in range(0, 2**table_index_size):
        inv_value = inv_approx_table[i]
        value_high = round(log(inv_value), self.precision.get_field_size() - (self.precision.get_exponent_size() + 1), sollya.RN)
        value_low = round(log(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low

    neg_input = Comparison(vx, -1, likely=False, precision=ML_Bool, specifier=Comparison.Less, debug=debug_multi, tag="neg_input")
    vx_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False, precision=ML_Bool, debug=debug_multi, tag="nan_or_inf")
    vx_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False, debug=debug_multi, tag="snan")
    vx_inf = Test(vx, specifier=Test.IsInfty, likely=False, debug=debug_multi, tag="inf")
    vx_subnormal = Test(vx, specifier=Test.IsSubnormal, likely=False, debug=debug_multi, tag="vx_subnormal")

    # for x = m.2^e, such that e >= 0
    #
    # log(1+x) = log(1 + m.2^e)
    #          = log(2^e . 2^-e + m.2^e)
    #          = log(2^e . (2^-e + m))
    #          = log(2^e) + log(2^-e + m)
    #          = e . log(2) + log(2^-e + m)
    #
    # t = (2^-e + m)
    # t = m_t . 2^e_t
    # r ~ 1 / m_t => r . m_t - 1 ~ 0
    #
    # t' = t . 2^-e_t
    #    = 2^(-e-e_t) + m . 2^-e_t
    #
    # if e >= 0, then 2^-e <= 1, then 1 <= m + 2^-e <= 3
    # r = m_r . 2^e_r
    #
    # log(1+x) = e.log(2) + log(r . 2^e_t . 2^-e_t . (2^-e + m) / r)
    #          = e.log(2) + log(r . 2^(-e-e_t) + r.m.2^-e_t) + e_t.log(2) - log(r)
    #          = (e+e_t).log(2) + log(r . t') - log(r)
    #          = (e+e_t).log(2) + P_log1p(r . t' - 1) - log(r)
    #

    # argument reduction
    m = MantissaExtraction(vx, tag="vx", precision=self.precision, debug=debug_multi)
    e = ExponentExtraction(vx, tag="e", precision=int_precision, debug=debug_multi)

    # 2^-e
    TwoMinusE = ExponentInsertion(-e, tag="Two_minus_e", precision=self.precision, debug=debug_multi)
    t = Addition(TwoMinusE, m, precision=self.precision, tag="t", debug=debug_multi)

    m_t = MantissaExtraction(t, tag="m_t", precision=self.precision, debug=debug_multi)
    e_t = ExponentExtraction(t, tag="e_t", precision=int_precision, debug=debug_multi)

    # 2^(-e-e_t)
    TwoMinusEEt = ExponentInsertion(-e-e_t, tag="Two_minus_e_et", precision=self.precision)
    TwoMinusEt = ExponentInsertion(-e_t, tag="Two_minus_et", precision=self.precision, debug=debug_multi)

    rcp_mt = ReciprocalSeed(m_t, tag="rcp_mt", precision=self.precision, debug=debug_multi)

    INDEX_SIZE = table_index_size
    table_index = generic_mantissa_msb_index_fct(INDEX_SIZE, m_t)
    table_index.set_attributes(tag="table_index", debug=debug_multi)

    log_inv_lo = TableLoad(log_table, table_index, 1, tag="log_inv_lo", debug=debug_multi)
    log_inv_hi = TableLoad(log_table, table_index, 0, tag="log_inv_hi", debug=debug_multi)

    inv_err = S2**-6  # TODO: link to target DivisionSeed precision

    Log.report(Log.Info, "building mathematical polynomial")
    approx_interval = Interval(-inv_err, inv_err)
    approx_fct = sollya.log1p(sollya.x) / (sollya.x)
    poly_degree = sup(guessdegree(approx_fct, approx_interval, S2**-(self.precision.get_field_size()+1))) + 1
    Log.report(Log.Debug, "poly_degree is {}", poly_degree)
    global_poly_object = Polynomial.build_from_approximation(approx_fct, poly_degree, [self.precision]*(poly_degree+1), approx_interval, sollya.absolute)
    poly_object = global_poly_object  # .sub_poly(start_index=1)

    EXT_PRECISION_MAP = {
        ML_Binary32: ML_SingleSingle,
        ML_Binary64: ML_DoubleDouble,
        ML_SingleSingle: ML_TripleSingle,
        ML_DoubleDouble: ML_TripleDouble
    }
    if self.precision not in EXT_PRECISION_MAP:
        Log.report(Log.Error, "no extended precision available for {}", self.precision)

    ext_precision = EXT_PRECISION_MAP[self.precision]

    # pre_rtp = r . 2^(-e-e_t) + m . 2^-e_t
    pre_rtp = Addition(
        rcp_mt * TwoMinusEEt,
        Multiplication(
            rcp_mt,
            Multiplication(
                m,
                TwoMinusEt,
                precision=self.precision,
                tag="pre_mult",
                debug=debug_multi,
            ),
            precision=ext_precision,
            tag="pre_mult2",
            debug=debug_multi,
        ),
        precision=ext_precision,
        tag="pre_rtp",
        debug=debug_multi
    )
    pre_red_vx = Addition(
        pre_rtp,
        -1,
        precision=ext_precision,
    )

    red_vx = Conversion(pre_red_vx, precision=self.precision, tag="red_vx", debug=debug_multi)

    Log.report(Log.Info, "generating polynomial evaluation scheme")
    poly = PolynomialSchemeEvaluator.generate_horner_scheme(
        poly_object, red_vx, unified_precision=self.precision)
    poly.set_attributes(tag="poly", debug=debug_multi)
    Log.report(Log.Debug, "{}", global_poly_object.get_sollya_object())

    fp_e = Conversion(e + e_t, precision=self.precision, tag="fp_e", debug=debug_multi)

    ext_poly = Multiplication(red_vx, poly, precision=ext_precision)

    pre_result = Addition(
        Addition(
            fp_e * log2_hi,
            fp_e * log2_lo,
            precision=ext_precision
        ),
        Addition(
            Addition(
                -log_inv_hi,
                -log_inv_lo,
                precision=ext_precision
            ),
            ext_poly,
            precision=ext_precision
        ),
        precision=ext_precision
    )

    result = Conversion(pre_result, precision=self.precision, tag="result", debug=debug_multi)

    # main scheme
    Log.report(Log.Info, "MDL scheme")
    pre_scheme = ConditionBlock(neg_input,
        Statement(
            ClearException(),
            Raise(ML_FPE_Invalid),
            Return(FP_QNaN(self.precision))
        ),
        ConditionBlock(vx_nan_or_inf,
            ConditionBlock(vx_inf,
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                Statement(
                    ClearException(),
                    ConditionBlock(vx_snan,
                        Raise(ML_FPE_Invalid)
                    ),
                    Return(FP_QNaN(self.precision))
                )
            ),
            Return(result)
        )
    )
    scheme = pre_scheme
    return scheme
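
# --- Illustrative, standalone sketch (not part of the generator above). ---
# Plain-double check of the argument reduction derived in the comment block of
# generate_scheme(): for x = m.2^e with e >= 0 and t = 2^-e + m = m_t.2^e_t,
#   log(1+x) = (e + e_t).log(2) + log1p(r.t' - 1) - log(r),   with t' = t.2^-e_t,
# and the identity holds for any reciprocal approximation r of m_t. The helper
# below uses only the math module; its name, inputs and tolerance are illustrative.
import math

def log1p_reduction_sketch(x):
    m, e = math.frexp(x)            # x = m * 2^e with 0.5 <= m < 1
    m, e = m * 2.0, e - 1           # renormalize so that 1 <= m < 2
    t = 2.0 ** (-e) + m             # chosen so that 1 + x = 2^e * t
    m_t, e_t = math.frexp(t)
    m_t, e_t = m_t * 2.0, e_t - 1
    r = 1.0 / m_t                   # stands in for ReciprocalSeed(m_t)
    t_prime = t * 2.0 ** (-e_t)
    return (e + e_t) * math.log(2.0) + math.log1p(r * t_prime - 1.0) - math.log(r)

if __name__ == "__main__":
    for x in (2.25, 3.5, 17.0, 1.0e10):
        assert abs(log1p_reduction_sketch(x) - math.log1p(x)) < 1e-12 * math.log1p(x)
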
def generate_scheme(self):
    """Produce an abstract scheme for the logarithm.

    This abstract scheme will be used by the code generation backend.
    """
    if self.precision not in [ML_Binary32, ML_Binary64]:
        Log.report(Log.Error, "The demanded precision is not supported")

    vx = self.implementation.add_input_variable("x", self.precision)

    def default_bool_convert(optree, precision=None, **kw):
        return bool_convert(optree, precision, -1, 0, **kw) \
            if isinstance(self.processor, VectorBackend) \
            else bool_convert(optree, precision, 1, 0, **kw)

    precision = self.precision.sollya_object
    int_prec = self.precision.get_integer_format()
    Log.report(Log.Info, "int_prec is %s" % int_prec)
    uint_prec = self.precision.get_unsigned_integer_format()

    Log.report(Log.Info, "MDL constants")
    cgpe_scheme_idx = int(self.cgpe_index)
    table_index_size = int(self.tbl_index_size)
    table_nb_elements = 2**(table_index_size)
    table_dimensions = [2*table_nb_elements]  # two values are stored for each element
    field_size = Constant(self.precision.get_field_size(),
                          precision = int_prec,
                          tag = 'field_size')

    if self.log_radix == EXP_1:
        log2_hi = Constant(
            round(log(2), precision, sollya.RN),
            precision = self.precision, tag = 'log2_hi')
        log2_lo = Constant(
            round(log(2) - round(log(2), precision, sollya.RN), precision, sollya.RN),
            precision = self.precision, tag = 'log2_lo')
    elif self.log_radix == 10:
        log2_hi = Constant(
            round(log10(2), precision, sollya.RN),
            precision = self.precision, tag = 'log2_hi')
        log2_lo = Constant(
            round(log10(2) - round(log10(2), precision, sollya.RN), precision, sollya.RN),
            precision = self.precision, tag = 'log2_lo')
    # ... if log_radix == '2' then log2(2) == 1

    # subnormal_mask aims at trapping positive subnormals except zero.
    # That's why we will subtract 1 from the integer bitstring of the input, and
    # then compare for Less (strict) the resulting integer bitstring to this
    # mask, e.g. 0x7fffff for binary32.
    if self.no_subnormal == False:
        subnormal_mask = Constant((1 << self.precision.get_field_size()) - 1,
                                  precision = int_prec, tag = 'subnormal_mask')
    fp_one = Constant(1.0, precision = self.precision, tag = 'fp_one')
    fp_one_as_uint = TypeCast(fp_one, precision = uint_prec, tag = 'fp_one_as_uint')
    int_zero = Constant(0, precision = int_prec, tag = 'int_zero')
    int_one = Constant(1, precision = int_prec, tag = 'int_one')
    table_mantissa_half_ulp = Constant(
        1 << (self.precision.field_size - table_index_size - 1),
        precision = int_prec
    )
    table_s_exp_index_mask = Constant(
        ~((table_mantissa_half_ulp.get_value() << 1) - 1),
        precision = uint_prec
    )

    Log.report(Log.Info, "MDL table")
    # The table holds approximations of -log(2^tau * r_i) so we first compute
    # the index value for which tau changes from 1 to 0.
    cut = sqrt(2.)
    tau_index_limit = floor(table_nb_elements * (2./cut - 1))
    sollya_logtbl = [
        (-log1p(float(i) / table_nb_elements)
         + (0 if i <= tau_index_limit else log(2.))) / log(self.log_radix)
        for i in range(table_nb_elements)
    ]
    # ...
    init_logtbl_hi = [
        round(sollya_logtbl[i],
              self.precision.get_mantissa_size(),
              sollya.RN)
        for i in range(table_nb_elements)
    ]
    init_logtbl_lo = [
        round(sollya_logtbl[i] - init_logtbl_hi[i],
              self.precision.get_mantissa_size(),
              sollya.RN)
        for i in range(table_nb_elements)
    ]
    init_logtbl = [tmp[i] for i in range(len(init_logtbl_hi)) for tmp in [init_logtbl_hi, init_logtbl_lo]]
    log1p_table = ML_NewTable(dimensions = table_dimensions,
                              storage_precision = self.precision,
                              init_data = init_logtbl,
                              tag = 'ml_log1p_table')
    # ...
    if self.no_rcp:
        sollya_rcptbl = [
            (1/((1+float(i)/table_nb_elements)+2**(-1-int(self.tbl_index_size))))
            for i in range(table_nb_elements)
        ]
        init_rcptbl = [
            round(sollya_rcptbl[i],
                  int(self.tbl_index_size)+1,  # self.precision.get_mantissa_size(),
                  sollya.RN)
            for i in range(table_nb_elements)
        ]
        rcp_table = ML_NewTable(dimensions = [table_nb_elements],
                                storage_precision = self.precision,
                                init_data = init_rcptbl,
                                tag = 'ml_rcp_table')
    # ...

    Log.report(Log.Info, 'MDL unified subnormal handling')
    vx_as_int = TypeCast(vx, precision = int_prec, tag = 'vx_as_int')
    if self.no_subnormal == False:
        vx_as_uint = TypeCast(vx, precision = uint_prec, tag = 'vx_as_uint')
        # Avoid the 0.0 case by subtracting 1 from vx_as_int
        tmp = Comparison(vx_as_int - 1, subnormal_mask, specifier = Comparison.Less)
        is_subnormal = default_bool_convert(
            tmp,  # Will catch negative values as well as NaNs with sign bit set
            precision = int_prec)
        is_subnormal.set_attributes(tag = "is_subnormal")
        if not(isinstance(self.processor, VectorBackend)):
            is_subnormal = Subtraction(Constant(0, precision = int_prec),
                                       is_subnormal,
                                       precision = int_prec)

        #################################################
        # Vectorizable integer based subnormal handling #
        #################################################
        # 1. lzcnt
        # custom lzcount-like for subnormal numbers using FPU (see draft article)
        Zi = BitLogicOr(vx_as_uint, fp_one_as_uint, precision = uint_prec, tag="Zi")
        Zf = Subtraction(
            TypeCast(Zi, precision = self.precision),
            fp_one,
            precision = self.precision,
            tag="Zf")
        # Zf exponent is -(nlz(x) - exponent_size).

        # 2. compute shift value
        # Vectorial comparison on x86+sse/avx is going to look like
        # '|0x00|0xff|0x00|0x00|' and that's why we use Negate.
        # But for scalar code generation, comparison will rather be either 0 or 1
        # in C. Thus mask below won't be correct for a scalar implementation.
        # FIXME: Can we know the backend that will be called and choose in
        # consequence? Should we make something arch-agnostic instead?
        #
        n_value = BitLogicAnd(
            Addition(
                DirtyExponentExtraction(Zf, self.precision),
                Constant(self.precision.get_bias(), precision = int_prec),
                precision = int_prec),
            is_subnormal,
            precision = int_prec,
            tag = "n_value")
        alpha = Negation(n_value, tag="alpha")
        #
        # 3. shift left
        # renormalized_mantissa = BitLogicLeftShift(vx_as_int, value)
        normal_vx_as_int = BitLogicLeftShift(vx_as_int, alpha)

        # 4. set exponent to the right value
        # Compute the exponent to add : (p-1)-(value) + 1 = p-1-value
        # The final "+ 1" comes from the fact that once renormalized, the
        # floating-point datum has a biased exponent of 1
        #tmp0 = Subtraction(
        #        field_size,
        #        value,
        #        precision = int_prec,
        #        tag="tmp0")
        # Set the value to 0 if the number is not subnormal
        #tmp1 = BitLogicAnd(tmp0, is_subnormal)
        #renormalized_exponent = BitLogicLeftShift(
        #        tmp1,
        #        field_size
        #        )
    else:  # no_subnormal == True
        normal_vx_as_int = vx_as_int

    #normal_vx_as_int = renormalized_mantissa + renormalized_exponent
    normal_vx = TypeCast(normal_vx_as_int, precision = self.precision, tag = 'normal_vx')

    # alpha = BitLogicAnd(field_size, is_subnormal, tag = 'alpha')

    # XXX Extract the mantissa, see if this is supported in the x86 vector
    # backend or if it still uses the support_lib.
    vx_mantissa = MantissaExtraction(normal_vx, precision = self.precision)

    Log.report(Log.Info, "MDL scheme")
    if self.force_division == True:
        rcp_m = Division(fp_one, vx_mantissa, precision = self.precision)
    elif self.no_rcp == False:
        rcp_m = ReciprocalSeed(vx_mantissa, precision = self.precision)
        if not self.processor.is_supported_operation(rcp_m):
            if self.precision == ML_Binary64:
                # Try using a binary32 FastReciprocal
                binary32_m = Conversion(vx_mantissa, precision = ML_Binary32)
                rcp_m = ReciprocalSeed(binary32_m, precision = ML_Binary32)
                rcp_m = Conversion(rcp_m, precision = ML_Binary64)
            if not self.processor.is_supported_operation(rcp_m):
                # FIXME An approximation table could be used instead but for vector
                # implementations another GATHER would be required.
                # However this may well be better than a division...
                rcp_m = Division(fp_one, vx_mantissa, precision = self.precision)
    else:  # ... use a look-up table
        rcp_shift = BitLogicLeftShift(normal_vx_as_int, self.precision.get_exponent_size() + 1)
        rcp_idx = BitLogicRightShift(rcp_shift, self.precision.get_exponent_size() + 1 + self.precision.get_field_size() - int(self.tbl_index_size))
        rcp_m = TableLoad(rcp_table, rcp_idx, tag = 'rcp_idx', debug = debug_multi)
    #
    rcp_m.set_attributes(tag = 'rcp_m')

    # exponent is normally either 0 or -1, since m is in [1, 2). Possible
    # optimization?
    # exponent = ExponentExtraction(rcp_m, precision = self.precision,
    #                               tag = 'exponent')

    ri_round = TypeCast(
        Addition(
            TypeCast(rcp_m, precision = int_prec),
            table_mantissa_half_ulp,
            precision = int_prec
        ),
        precision = uint_prec
    )
    ri_fast_rndn = BitLogicAnd(
        ri_round,
        table_s_exp_index_mask,
        tag = 'ri_fast_rndn',
        precision = uint_prec
    )

    # u = m * ri - 1
    ul = None
    if self.no_rcp == True:
        # ... u does not fit on a single word
        tmp_u, tmp_ul = Mul211(vx_mantissa,
                               TypeCast(ri_fast_rndn, precision = self.precision),
                               fma = (self.no_fma == False))
        fp_minus_one = Constant(-1.0, precision = self.precision, tag = 'fp_minus_one')
        u, ul = Add212(fp_minus_one, tmp_u, tmp_ul)
        u.set_attributes(tag='uh')
        ul.set_attributes(tag='ul')
    elif self.no_fma == False:
        u = FusedMultiplyAdd(
            vx_mantissa,
            TypeCast(ri_fast_rndn, precision = self.precision),
            fp_one,
            specifier = FusedMultiplyAdd.Subtract,
            tag = 'u')
    else:  # disable FMA
        # tmph + tmpl = m * ri, where tmph ~ 1
        tmph, tmpl = Mul211(vx_mantissa,
                            TypeCast(ri_fast_rndn, precision = self.precision),
                            fma = False)
        # u_tmp = tmph - 1 ... exact due to Sterbenz
        u_tmp = Subtraction(tmph, fp_one, precision = self.precision)
        # u = u_tmp - tmpl ... exact since the result u is representable as a single word
        u = Addition(u_tmp, tmpl, precision = self.precision, tag = 'u')

    unneeded_bits = Constant(
        self.precision.field_size - table_index_size,
        precision=uint_prec,
        tag="unneeded_bits"
    )
    assert self.precision.field_size - table_index_size >= 0
    ri_bits = BitLogicRightShift(
        ri_fast_rndn,
        unneeded_bits,
        precision = uint_prec,
        tag = "ri_bits"
    )

    # Retrieve mantissa's MSBs + first bit of exponent, for tau computation in case
    # exponent is 0 (i.e. biased 127, i.e. first bit of exponent is set.).
    # In this particular case, i = 0 but tau is 1
    # table_index does not need to be as long as uint_prec might be,
    # try and keep it the size of size_t.
    size_t_prec = ML_UInt32
    signed_size_t_prec = ML_Int32
    table_index_mask = Constant(
        (1 << (table_index_size + 1)) - 1,
        precision = size_t_prec
    )
    table_index = BitLogicAnd(
        Conversion(ri_bits, precision = size_t_prec),
        table_index_mask,
        tag = 'table_index',
        precision = size_t_prec
    )

    # Compute tau using the tau_index_limit value.
    tmp = default_bool_convert(
        Comparison(
            TypeCast(table_index, precision = signed_size_t_prec),
            Constant(tau_index_limit, precision = signed_size_t_prec),
            specifier = Comparison.Greater if isinstance(self.processor, VectorBackend) else Comparison.LessOrEqual
        ),
        precision = signed_size_t_prec,
        tag="tmp"
    )
    # A true tmp will typically be -1 for VectorBackends, but 1 for standard C.
    tau = Conversion(
        Addition(tmp, Constant(1, precision=signed_size_t_prec),
                 precision = signed_size_t_prec, tag="pre_add")
        if isinstance(self.processor, VectorBackend) else tmp,
        precision=int_prec,
        tag="pre_tau"
    )
    tau.set_attributes(tag = 'tau')

    # Update table_index: keep only table_index_size bits
    table_index_hi = BitLogicAnd(
        table_index,
        Constant((1 << table_index_size) - 1, precision = size_t_prec),
        precision = size_t_prec
    )
    # table_index_hi = table_index_hi << 1
    table_index_hi = BitLogicLeftShift(
        table_index_hi,
        Constant(1, precision = size_t_prec),
        precision = size_t_prec,
        tag = "table_index_hi"
    )
    # table_index_lo = table_index_hi + 1
    table_index_lo = Addition(
        table_index_hi,
        Constant(1, precision = size_t_prec),
        precision = size_t_prec,
        tag = "table_index_lo"
    )

    tbl_hi = TableLoad(log1p_table, table_index_hi, tag = 'tbl_hi', debug = debug_multi)
    tbl_lo = TableLoad(log1p_table, table_index_lo, tag = 'tbl_lo', debug = debug_multi)

    # Compute exponent e + tau - alpha, but first subtract the bias.
    if self.no_subnormal == False:
        tmp_eptau = Addition(
            Addition(
                BitLogicRightShift(
                    normal_vx_as_int,
                    field_size,
                    tag = 'exponent',
                    interval = self.precision.get_exponent_interval(),
                    precision = int_prec),
                Constant(self.precision.get_bias(), precision = int_prec)),
            tau,
            tag = 'tmp_eptau',
            precision = int_prec)
        exponent = Subtraction(tmp_eptau, alpha, precision = int_prec)
    else:
        exponent = Addition(
            Addition(
                BitLogicRightShift(
                    normal_vx_as_int,
                    field_size,
                    tag = 'exponent',
                    interval = self.precision.get_exponent_interval(),
                    precision = int_prec),
                Constant(self.precision.get_bias(), precision = int_prec)),
            tau,
            tag = 'tmp_eptau',
            precision = int_prec)
    #
    fp_exponent = Conversion(exponent, precision = self.precision, tag = 'fp_exponent')

    Log.report(Log.Info, 'MDL polynomial approximation')
    if self.log_radix == EXP_1:
        sollya_function = log(1 + sollya.x)
    elif self.log_radix == 2:
        sollya_function = log2(1 + sollya.x)
    elif self.log_radix == 10:
        sollya_function = log10(1 + sollya.x)
    # ...
    if self.force_division == True:
        # rcp accuracy is 2^(-p)
        boundrcp = 2**(-self.precision.get_precision())
    else:
        boundrcp = 1.5 * 2**(-12)  # ... see Intel intrinsics guide
        if self.precision in [ML_Binary64]:
            if not self.processor.is_supported_operation(rcp_m):
                boundrcp = (1+boundrcp)*(1+2**(-24)) - 1
            else:
                boundrcp = 2**(-14)  # ... see Intel intrinsics guide

    arg_red_mag = boundrcp + 2**(-table_index_size-1) + boundrcp * 2**(-table_index_size-1)
    if self.no_rcp == False:
        approx_interval = Interval(-arg_red_mag, arg_red_mag)
    else:
        approx_interval = Interval(-2**(-int(self.tbl_index_size)+1), 2**(-int(self.tbl_index_size)+1))
    max_eps = 2**-(2*(self.precision.get_field_size()))
    Log.report(Log.Info, "max acceptable error for polynomial = {}".format(float.hex(max_eps)))
    poly_degree = sup(
        guessdegree(
            sollya_function,
            approx_interval,
            max_eps,
        )
    )
    Log.report(Log.Info, "poly degree is {}", poly_degree)
    if self.log_radix == EXP_1:
        poly_object = Polynomial.build_from_approximation(
            sollya_function,
            range(2, int(poly_degree) + 1),  # Force 1st 2 coeffs to 0 and 1, resp.
            # Emulate double-self.precision coefficient formats
            [self.precision.get_mantissa_size()*2 + 1]*(poly_degree - 1),
            approx_interval,
            sollya.absolute,
            0 + sollya._x_)  # Force the first 2 coefficients to 0 and 1, resp.
    else:  # ... == '2' or '10'
        poly_object = Polynomial.build_from_approximation(
            sollya_function,
            range(1, int(poly_degree) + 1),  # Force 1st coeff to 0
            # Emulate double-self.precision coefficient formats
            [self.precision.get_mantissa_size()*2 + 1]*(poly_degree),
            approx_interval,
            sollya.absolute,
            0)  # Force the first coefficient to 0

    Log.report(Log.Info, str(poly_object))

    constant_precision = ML_SingleSingle if self.precision == ML_Binary32 \
        else ML_DoubleDouble if self.precision == ML_Binary64 \
        else None
    if is_cgpe_available():
        log1pu_poly = PolynomialSchemeEvaluator.generate_cgpe_scheme(
            poly_object,
            u,
            unified_precision = self.precision,
            constant_precision = constant_precision,
            scheme_id = cgpe_scheme_idx
        )
    else:
        Log.report(Log.Warning, "CGPE not available, falling back to std poly evaluator")
        log1pu_poly = PolynomialSchemeEvaluator.generate_horner_scheme(
            poly_object,
            u,
            unified_precision = self.precision,
            constant_precision = constant_precision
        )

    # XXX Dirty implementation of double-(self.precision) poly
    def dirty_poly_node_conversion(node, variable_h, variable_l, use_fma):
        return dirty_multi_node_expand(
            node, self.precision,
            mem_map={variable_h: (variable_h, variable_l)},
            fma=use_fma)
    log1pu_poly_hi, log1pu_poly_lo = dirty_poly_node_conversion(
        log1pu_poly, u, ul,
        use_fma=(self.no_fma == False))

    log1pu_poly_hi.set_attributes(tag = 'log1pu_poly_hi')
    log1pu_poly_lo.set_attributes(tag = 'log1pu_poly_lo')

    # Compute log(2) * (e + tau - alpha)
    if self.log_radix != 2:  # 'e' or '10'
        log2e_hi, log2e_lo = Mul212(fp_exponent, log2_hi, log2_lo,
                                    fma = (self.no_fma == False))
    # Add log1p(u)
    if self.log_radix != 2:  # 'e' or '10'
        tmp_res_hi, tmp_res_lo = Add222(log2e_hi, log2e_lo,
                                        log1pu_poly_hi, log1pu_poly_lo)
    else:
        tmp_res_hi, tmp_res_lo = Add212(fp_exponent,
                                        log1pu_poly_hi, log1pu_poly_lo)

    # Add -log(2^(tau)/m) approximation retrieved by two table lookups
    logx_hi = Add122(tmp_res_hi, tmp_res_lo, tbl_hi, tbl_lo)[0]
    logx_hi.set_attributes(tag = 'logx_hi')

    scheme = Return(logx_hi, precision = self.precision)

    return scheme
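
# --- Illustrative, standalone sketch (not part of the generator above). ---
# The ri_round / ri_fast_rndn pair above rounds the reciprocal seed to its top
# table_index_size mantissa bits by adding half an ulp of the truncated format
# and masking away the low bits (ties round upward, which is fine for indexing).
# Below, the same bit trick in plain Python for binary32 and a 7-bit index;
# struct stands in for the TypeCast nodes, and all names are illustrative.
import struct

def round_mantissa_msbs(x, index_size=7, field_size=23):
    """Round the binary32 value x to its top index_size mantissa bits."""
    as_uint = struct.unpack("<I", struct.pack("<f", x))[0]
    half_ulp = 1 << (field_size - index_size - 1)
    mask = ~((half_ulp << 1) - 1) & 0xFFFFFFFF
    rounded = (as_uint + half_ulp) & mask   # a carry may ripple into the exponent: still the correct rounding
    return struct.unpack("<f", struct.pack("<I", rounded))[0]

if __name__ == "__main__":
    r = 1.0 / 1.37109375              # a reciprocal-seed-like value in (0.5, 1]
    print(round_mantissa_msbs(r))     # r rounded onto the 7-bit table grid
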
def numeric_emulate(self, input_value):
    return log1p(input_value)
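
# --- Illustrative, standalone reminder (not part of the class above). ---
# numeric_emulate uses log1p as the reference rather than log(1 + x): for tiny
# inputs, forming 1 + x in floating point discards x entirely, while log1p
# keeps full accuracy. Plain-Python demonstration (math stands in for sollya):
import math

if __name__ == "__main__":
    x = 1e-20
    print(math.log(1.0 + x))   # 0.0     (1 + x rounds to 1.0 before the log)
    print(math.log1p(x))       # 1e-20   (accurate to double precision)
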
def generate_scheme(self):
    vx = self.implementation.add_input_variable("x", self.precision)

    sollya_precision = self.get_input_precision().sollya_object

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    log2_hi_value = round(log(2), self.precision.get_field_size() - (self.precision.get_exponent_size() + 1), sollya.RN)
    log2_lo_value = round(log(2) - log2_hi_value, self.precision.sollya_object, sollya.RN)

    log2_hi = Constant(log2_hi_value, precision = self.precision)
    log2_lo = Constant(log2_lo_value, precision = self.precision)

    vx_exp = ExponentExtraction(vx, tag = "vx_exp", debug = debugd)

    int_precision = self.precision.get_integer_format()

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision = self.precision)
    dummy_div_seed = ReciprocalSeed(dummy_var, precision = self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_div_seed, language = None,
        table_getter = lambda self: self.approx_table_map)

    # table creation
    table_index_size = 7
    log_table = ML_NewTable(dimensions = [2**table_index_size, 2], storage_precision = self.precision)
    log_table[0][0] = 0.0
    log_table[0][1] = 0.0
    for i in range(1, 2**table_index_size):
        #inv_value = (1.0 + (self.processor.inv_approx_table[i] / S2**9) + S2**-52) * S2**-1
        inv_value = inv_approx_table[i]  # (1.0 + (inv_approx_table[i] / S2**9) ) * S2**-1
        value_high = round(log(inv_value), self.precision.get_field_size() - (self.precision.get_exponent_size() + 1), sollya.RN)
        value_low = round(log(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low

    vx_exp = ExponentExtraction(vx, tag = "vx_exp", debug = debugd)

    # case close to 0: ctz
    ctz_exp_limit = -7
    ctz_cond = vx_exp < ctz_exp_limit
    ctz_interval = Interval(-S2**ctz_exp_limit, S2**ctz_exp_limit)

    ctz_poly_degree = sup(guessdegree(log1p(sollya.x)/sollya.x, ctz_interval, S2**-(self.precision.get_field_size()+1))) + 1
    ctz_poly_object = Polynomial.build_from_approximation(log1p(sollya.x)/sollya.x, ctz_poly_degree, [self.precision]*(ctz_poly_degree+1), ctz_interval, sollya.absolute)

    Log.report(Log.Info, "generating polynomial evaluation scheme")
    ctz_poly = PolynomialSchemeEvaluator.generate_horner_scheme(ctz_poly_object, vx, unified_precision = self.precision)
    ctz_poly.set_attributes(tag = "ctz_poly", debug = debug_lftolx)

    ctz_result = vx * ctz_poly

    neg_input = Comparison(vx, -1, likely = False, specifier = Comparison.Less, debug = debugd, tag = "neg_input")
    vx_nan_or_inf = Test(vx, specifier = Test.IsInfOrNaN, likely = False, debug = debugd, tag = "nan_or_inf")
    vx_snan = Test(vx, specifier = Test.IsSignalingNaN, likely = False, debug = debugd, tag = "snan")
    vx_inf = Test(vx, specifier = Test.IsInfty, likely = False, debug = debugd, tag = "inf")
    vx_subnormal = Test(vx, specifier = Test.IsSubnormal, likely = False, debug = debugd, tag = "vx_subnormal")

    log_function_code = CodeFunction("new_log", [Variable("x", precision = ML_Binary64)], output_format = ML_Binary64)
    log_call_generator = FunctionOperator(log_function_code.get_name(), arity = 1, output_precision = ML_Binary64, declare_prototype = log_function_code)
    newlog_function = FunctionObject(log_function_code.get_name(), (ML_Binary64,), ML_Binary64, log_call_generator)

    # case away from 0.0
    pre_vxp1 = vx + 1.0
    pre_vxp1.set_attributes(tag = "pre_vxp1", debug = debug_lftolx)
    pre_vxp1_exp = ExponentExtraction(pre_vxp1, tag = "pre_vxp1_exp", debug = debugd)

    cm500 = Constant(-500, precision = ML_Int32)
    c0 = Constant(0, precision = ML_Int32)
    cond_scaling = pre_vxp1_exp > 2**(self.precision.get_exponent_size()-2)
    scaling_factor_exp = Select(cond_scaling, cm500, c0)
    scaling_factor = ExponentInsertion(scaling_factor_exp, precision = self.precision, tag = "scaling_factor")

    vxp1 = pre_vxp1 * scaling_factor
    vxp1.set_attributes(tag = "vxp1", debug = debug_lftolx)
    vxp1_exp = ExponentExtraction(vxp1, tag = "vxp1_exp", debug = debugd)

    vxp1_inv = ReciprocalSeed(vxp1, precision = self.precision, tag = "vxp1_inv", debug = debug_lftolx, silent = True)
    vxp1_dirty_inv = ExponentInsertion(-vxp1_exp, precision = self.precision, tag = "vxp1_dirty_inv", debug = debug_lftolx)

    table_index = BitLogicAnd(BitLogicRightShift(TypeCast(vxp1, precision = int_precision, debug = debuglx), self.precision.get_field_size() - 7, debug = debuglx), 0x7f, tag = "table_index", debug = debuglx)

    # argument reduction
    # TODO: detect if single operand inverse seed is supported by the targeted architecture
    pre_arg_red_index = TypeCast(BitLogicAnd(TypeCast(vxp1_inv, precision = ML_UInt64), Constant(-2, precision = ML_UInt64), precision = ML_UInt64), precision = self.precision, tag = "pre_arg_red_index", debug = debug_lftolx)
    arg_red_index = Select(Equal(table_index, 0), vxp1_dirty_inv, pre_arg_red_index, tag = "arg_red_index", debug = debug_lftolx)

    red_vxp1 = Select(cond_scaling, arg_red_index * vxp1 - 1.0, (arg_red_index * vx - 1.0) + arg_red_index)
    #red_vxp1 = arg_red_index * vxp1 - 1.0
    red_vxp1.set_attributes(tag = "red_vxp1", debug = debug_lftolx)

    log_inv_lo = TableLoad(log_table, table_index, 1, tag = "log_inv_lo", debug = debug_lftolx)
    log_inv_hi = TableLoad(log_table, table_index, 0, tag = "log_inv_hi", debug = debug_lftolx)

    inv_err = S2**-6  # TODO: link to target DivisionSeed precision

    Log.report(Log.Info, "building mathematical polynomial")
    approx_interval = Interval(-inv_err, inv_err)
    poly_degree = sup(guessdegree(log(1+sollya.x)/sollya.x, approx_interval, S2**-(self.precision.get_field_size()+1))) + 1
    global_poly_object = Polynomial.build_from_approximation(log(1+sollya.x)/sollya.x, poly_degree, [self.precision]*(poly_degree+1), approx_interval, sollya.absolute)
    poly_object = global_poly_object.sub_poly(start_index = 1)

    Log.report(Log.Info, "generating polynomial evaluation scheme")
    _poly = PolynomialSchemeEvaluator.generate_horner_scheme(poly_object, red_vxp1, unified_precision = self.precision)
    _poly.set_attributes(tag = "poly", debug = debug_lftolx)
    Log.report(Log.Info, global_poly_object.get_sollya_object())

    vxp1_inv_exp = ExponentExtraction(vxp1_inv, tag = "vxp1_inv_exp", debug = debugd)
    corr_exp = Conversion(-vxp1_exp + scaling_factor_exp, precision = self.precision)  # vxp1_inv_exp

    #poly = (red_vxp1) * (1 + _poly)
    #poly.set_attributes(tag = "poly", debug = debug_lftolx, prevent_optimization = True)

    pre_result = -log_inv_hi + (red_vxp1 + red_vxp1 * _poly + (-corr_exp * log2_lo - log_inv_lo))
    pre_result.set_attributes(tag = "pre_result", debug = debug_lftolx)

    exact_log2_hi_exp = - corr_exp * log2_hi
    exact_log2_hi_exp.set_attributes(tag = "exact_log2_hi_exp", debug = debug_lftolx, prevent_optimization = True)
    #std_result = exact_log2_hi_exp + pre_result

    exact_log2_lo_exp = - corr_exp * log2_lo
    exact_log2_lo_exp.set_attributes(tag = "exact_log2_lo_exp", debug = debug_lftolx)  #, prevent_optimization = True)

    init = exact_log2_lo_exp - log_inv_lo
    init.set_attributes(tag = "init", debug = debug_lftolx, prevent_optimization = True)

    fma0 = (red_vxp1 * _poly + init)  # - log_inv_lo)
    fma0.set_attributes(tag = "fma0", debug = debug_lftolx)
    step0 = fma0
    step0.set_attributes(tag = "step0", debug = debug_lftolx)  #, prevent_optimization = True)

    step1 = step0 + red_vxp1
    step1.set_attributes(tag = "step1", debug = debug_lftolx, prevent_optimization = True)

    step2 = -log_inv_hi + step1
    step2.set_attributes(tag = "step2", debug = debug_lftolx, prevent_optimization = True)

    std_result = exact_log2_hi_exp + step2
    std_result.set_attributes(tag = "std_result", debug = debug_lftolx, prevent_optimization = True)

    # main scheme
    Log.report(Log.Info, "MDL scheme")
    pre_scheme = ConditionBlock(neg_input,
        Statement(
            ClearException(),
            Raise(ML_FPE_Invalid),
            Return(FP_QNaN(self.precision))
        ),
        ConditionBlock(vx_nan_or_inf,
            ConditionBlock(vx_inf,
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                Statement(
                    ClearException(),
                    ConditionBlock(vx_snan,
                        Raise(ML_FPE_Invalid)
                    ),
                    Return(FP_QNaN(self.precision))
                )
            ),
            ConditionBlock(vx_subnormal,
                Return(vx),
                ConditionBlock(ctz_cond,
                    Statement(
                        Return(ctz_result),
                    ),
                    Statement(
                        Return(std_result)
                    )
                )
            )
        )
    )
    scheme = pre_scheme
    return scheme
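
# --- Illustrative, standalone sketch (not part of the generator above). ---
# The close-to-zero ("ctz") path above returns x * P(x), with P fitted by sollya
# to approximate log1p(x)/x on [-2^-7, 2^-7]. Here a truncated Taylor expansion
# of log1p(x)/x = 1 - x/2 + x^2/3 - ... stands in for the fitted polynomial,
# evaluated with Horner's rule as generate_horner_scheme would; the name, the
# degree and the tolerance are illustrative only.
import math

def ctz_log1p_sketch(x, degree=6):
    """Evaluate x * P(x), P being the degree-`degree` Taylor polynomial of log1p(x)/x."""
    coeffs = [(-1.0) ** k / (k + 1.0) for k in range(degree + 1)]
    p = 0.0
    for c in reversed(coeffs):   # Horner evaluation, highest-degree coefficient first
        p = p * x + c
    return x * p

if __name__ == "__main__":
    for x in (2.0 ** -8, -2.0 ** -9, 1.5e-4):
        assert abs(ctz_log1p_sketch(x) - math.log1p(x)) < 1e-14 * abs(math.log1p(x))
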