def generate_scheme(self):
    """ Build the evaluation scheme of a table read/write test:
        fill a 2-column table, load both columns at a computed index,
        store their sum into an output table and return the re-loaded value. """
    #func_implementation = CodeFunction(self.function_name, output_format = self.precision)
    # declaring main input variable
    vx = self.implementation.add_input_variable("x", self.precision)
    table_size = 16
    row_size = 2

    # input table: new_table[i] = (i, i + 1)
    new_table = ML_NewTable(dimensions=[table_size, row_size],
                            storage_precision=self.precision)
    for i in range(table_size):
        new_table[i][0] = i
        new_table[i][1] = i + 1

    # index = x % table_size keeps the loads within table bounds
    index = Modulo(vx, Constant(table_size, precision=ML_Int32),
                   precision=ML_Int32)
    load_value_lo = TableLoad(new_table, index,
                              Constant(0, precision=ML_Int32),
                              precision=self.precision)
    load_value_hi = TableLoad(new_table, index,
                              Constant(1, precision=ML_Int32),
                              precision=self.precision)

    Log.report(Log.Info, "table interval: {}".format(new_table.get_interval()))

    # output table, written at run time through TableStore (hence empty=True)
    out_table = ML_NewTable(dimensions=[table_size],
                            storage_precision=self.precision,
                            empty=True)

    result = Addition(load_value_lo, load_value_hi, precision=self.precision)

    # store the sum at a fixed index (13) then return the re-loaded value
    scheme = Statement(
        TableStore(
            result,
            out_table,
            Constant(13, precision=ML_Int32),
            precision=ML_Void,
        ),
        Return(
            TableLoad(out_table, Constant(13, precision=ML_Int32),
                      precision=self.precision),
            precision=self.precision,
        ))
    return scheme
def generate_scheme(self):
    """ Build a scheme exercising pointer support:
        pointer dereference + assignment, pointer-to-pointer cast,
        and table-to-pointer cast. """
    vx = self.implementation.add_input_variable("x", ML_Binary32)
    px = self.implementation.add_input_variable("px", ML_Binary32_p)

    result = vx * vx
    # pointer dereferencing and value assignment
    px_assign = ReferenceAssign(Dereference(px, precision=ML_Binary32), result)

    # pointer to pointer cast
    py = Variable("py", precision=ML_Binary64_p, vartype=Variable.Local)
    py_assign = ReferenceAssign(py, TypeCast(px, precision=ML_Binary64_p))

    table_size = 16
    row_size = 2
    new_table = ML_NewTable(dimensions=[table_size, row_size],
                            storage_precision=self.precision)
    for i in range(table_size):
        new_table[i][0] = i
        new_table[i][1] = i + 1

    # cast between table and pointer
    # NOTE(review): pz is declared with ML_Pointer_Format(self.precision)
    # but the cast target is hard-coded to ML_Binary64_p -- this is only
    # consistent when self.precision is ML_Binary64; confirm intent
    pz = Variable("pz",
                  precision=ML_Pointer_Format(self.precision),
                  vartype=Variable.Local)
    pz_assign = ReferenceAssign(
        pz, TypeCast(new_table, precision=ML_Binary64_p))

    scheme = Statement(px_assign, py_assign, pz_assign)
    return scheme
def generate_scheme(self):
    """ Build an operation graph mixing floating-point arithmetic,
        a table load and integer bit manipulation, and return its result. """
    # declaring function input variable
    vx = self.implementation.add_input_variable("x", self.precision)

    double_x = Addition(vx, vx, precision=self.precision)
    product = Multiplication(double_x, vx, precision=self.precision)
    offset_cst = Constant(1.1, precision=self.precision)

    index_size = 4
    table_size = 2**index_size
    # value_table[i] = i
    value_table = ML_NewTable(dimensions=[table_size],
                              storage_precision=self.precision)
    for entry_id in range(table_size):
        value_table[entry_id] = entry_id

    raw_index = NearestInteger(vx, precision=ML_Int32)
    # index = index % table_size = index & (2**index_size - 1)
    masked_index = BitLogicAnd(
        raw_index,
        Constant(2**index_size - 1, precision=ML_Int32),
        precision=ML_Int32)
    index = BitLogicRightShift(
        masked_index,
        Constant(1, precision=ML_Int32),
        precision=ML_Int32)

    table_value = TableLoad(value_table, index, precision=self.precision)

    # integer sub-graph: index * (index + 7)
    int_tree = Multiplication(
        index,
        Addition(index, Constant(7, precision=ML_Int32), precision=ML_Int32),
        precision=ML_Int32)

    result = Multiplication(
        table_value,
        FusedMultiplyAdd(
            Addition(offset_cst,
                     Conversion(int_tree, precision=self.precision),
                     precision=self.precision),
            product,
            double_x,
            specifier=FusedMultiplyAdd.Subtract,
            precision=self.precision),
        precision=self.precision,
        tag="result")

    # conv_pass = Pass_M128_Promotion(self.processor)
    # new_scheme = conv_pass.execute(scheme)
    return Return(result, precision=self.precision, debug=debug_multi)
def generate_log_table(self, log_f, inv_approx_table):
    """ generate 2 tables:
        log_table[i]     = 2-word unevaluated sum approximation of
                           log_f(inv_approx_table[i])
        log_table_tho[i] = 2-word unevaluated sum approximation of
                           log_f(2*inv_approx_table[i]) """
    sollya_precision = self.get_input_precision().get_sollya_object()
    # number of bits kept in the high word of each 2-word entry
    hi_size = self.precision.get_field_size() - (
        self.precision.get_exponent_size() + 1)

    def two_word_split(value):
        # split log_f(value) into an unevaluated hi + lo sum
        hi_word = round(log_f(value), hi_size, sollya.RN)
        lo_word = round(log_f(value) - hi_word, sollya_precision, sollya.RN)
        return hi_word, lo_word

    # table creation
    table_index_size = inv_approx_table.index_size
    num_entries = 2**table_index_size
    table_index_range = range(1, num_entries)
    log_table = ML_NewTable(dimensions=[num_entries, 2],
                            storage_precision=self.precision,
                            const=True)
    log_table_tho = ML_NewTable(dimensions=[num_entries, 2],
                                storage_precision=self.precision,
                                const=True)
    # entry 0 is filled with exact zeros
    log_table[0][0] = 0.0
    log_table[0][1] = 0.0
    log_table_tho[0][0] = 0.0
    log_table_tho[0][1] = 0.0

    for i in table_index_range:
        log_table[i][0], log_table[i][1] = two_word_split(inv_approx_table[i])
        log_table_tho[i][0], log_table_tho[i][1] = two_word_split(
            S2 * inv_approx_table[i])

    return log_table, log_table_tho, table_index_range
def generate_scheme(self):
    """ Build a scheme that loads a BFloat16 table entry (indexed by the
        input) and converts it to binary32. """
    # declaring function input variable
    vx = self.implementation.add_input_variable(
        "x", self.get_input_precision(0))

    # bf16_params[i] = 1.1**i
    bf16_params = ML_NewTable(dimensions=[self.table_size],
                              storage_precision=BFloat16)
    for entry_id in range(self.table_size):
        bf16_params[entry_id] = 1.1**entry_id

    conv_vx = Conversion(
        TableLoad(bf16_params, vx),
        precision=ML_Binary32,
        tag="conv_vx",
        debug=debug_multi)

    return Return(conv_vx, precision=self.precision, debug=debug_multi)
def generate_1d_table(dim, storage_precision, tag, value_gen=lambda index: None, empty=False, const=True):
    """ Build a 1D ML_NewTable of @p dim entries whose i-th value is
        value_gen(i) """
    table_1d = ML_NewTable(
        dimensions=[dim],
        storage_precision=storage_precision,
        tag=tag,
        const=const,
        empty=empty)
    for entry_index in range(dim):
        table_1d[entry_index] = value_gen(entry_index)
    return table_1d
def generate_2d_table(dim0, dim1, storage_precision, tag, value_gen=(lambda index0: None), const=True):
    """ Build a dim0 x dim1 ML_NewTable; value_gen(i0) produces the full
        i0-th row at once (row-at-a-time rather than cell-by-cell) """
    table_2d = ML_NewTable(
        dimensions=[dim0, dim1],
        storage_precision=storage_precision,
        const=const,
        tag=tag)
    for row_id in range(dim0):
        row = value_gen(row_id)
        for col_id in range(dim1):
            table_2d[row_id][col_id] = row[col_id]
    return table_2d
def generate_2d_multi_table(size_offset_list, dim1, storage_precision, tag, value_gen=lambda table_index, sub_row_index: None):
    """ Build a 2D multi-array stored in a single ML_NewTable.
        The first dimension concatenates one sub-array per (size, offset)
        pair of @p size_offset_list; the second dimension is @p dim1.
        value_gen(table_index, sub_row_index) produces a full row at a
        time (rather than cell by cell) """
    # overall first dimension is the sum of every sub-array size
    dim0 = sum(size_offset_list[sub_id][0]
               for sub_id in range(size_offset_list.dimensions[0]))
    multi_table = ML_NewTable(
        dimensions=[dim0, dim1],
        storage_precision=storage_precision,
        tag=tag)
    for sub_table_id, (sub_size, sub_offset) in enumerate(size_offset_list):
        for local_row in range(sub_size):
            row = value_gen(sub_table_id, local_row)
            for col_id in range(dim1):
                multi_table[sub_offset + local_row][col_id] = row[col_id]
    return multi_table
def generate_scalar_scheme(self, vx, inline_select=False):
    """ Build the scalar evaluation scheme of exp2(x).

        Argument reduction: x = floor(x) + r_hi/2^index_size + r_lo, so
        2^x = 2^floor(x) * table[r_hi] * 2^r_lo, with 2^r_lo evaluated by
        a polynomial. Overflow/underflow/subnormal cases are handled
        either by a Select tree (inline_select=True) or by a
        ConditionBlock/Return scheme. """
    Log.set_dump_stdout(True)
    Log.report(Log.Info, "\033[33;1m generating implementation scheme \033[0m")
    if self.debug_flag:
        Log.report(Log.Info, "\033[31;1m debug has been enabled \033[0;m")

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    # r_interval = Interval(0, 1.0)
    index_size = 3
    # residual interval after tabulating the 2^index_size top bits of the
    # fractional part
    r_interval = Interval(-2**(-index_size), 2**-index_size)

    local_ulp = sup(ulp(2**r_interval, self.precision))
    Log.report(Log.Info, "ulp: ", local_ulp)
    error_goal = S2**-1 * local_ulp
    Log.report(Log.Info, "error goal: ", error_goal)

    sollya_precision = {
        ML_Binary32: sollya.binary32,
        ML_Binary64: sollya.binary64
    }[self.precision]
    int_precision = {
        ML_Binary32: ML_Int32,
        ML_Binary64: ML_Int64
    }[self.precision]

    # Argument Reduction
    # r = x - floor(x), r >= 0
    vx_floor = Floor(vx, precision=self.precision, tag='vx_floor',
                     debug=debug_multi)
    vx_int = Conversion(vx_floor, precision=int_precision, tag="vx_int",
                        debug=debug_multi)
    vx_intf = vx_floor  # Conversion(vx_int, precision = self.precision)
    vx_r = vx - vx_intf
    r_hi = NearestInteger(vx_r * 2**index_size,
                          precision=self.precision,
                          tag="r_hi",
                          debug=debug_multi)
    # clamping r_hi_int within table-size to make sure
    # it does not exceed hi_part_table when used to index it.
    # FIX: the table has 2**index_size + 1 entries, so the largest valid
    # index is 2**index_size (the previous bound 2**index_size + 1
    # allowed an out-of-bounds load)
    r_hi_int = Max(
        Min(
            Conversion(r_hi,
                       precision=int_precision,
                       tag="r_hi_int",
                       debug=debug_multi), 2**index_size), 0)
    r_lo = vx_r - r_hi * 2**-index_size
    r_lo.set_attributes(tag="r_lo", debug=debug_multi)
    vx_r.set_attributes(tag="vx_r", debug=debug_multi)

    degree = sup(guessdegree(2**(sollya.x), r_interval, error_goal)) + 2
    precision_list = [1] + [self.precision] * degree

    exp_X = ExponentInsertion(vx_int, tag="exp_X", debug=debug_multi,
                              precision=self.precision)

    # Polynomial Approx: 2^r_lo - 1 over r_interval
    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme
    poly_object, poly_error = Polynomial.build_from_approximation_with_error(
        2**(sollya.x) - 1, degree, precision_list, r_interval,
        sollya.absolute)
    Log.report(Log.Info, "Poly : %s" % poly_object)
    Log.report(Log.Info, "poly_error : ", poly_error)
    poly = polynomial_scheme_builder(poly_object.sub_poly(start_index=1),
                                     r_lo,
                                     unified_precision=self.precision)
    poly.set_attributes(tag="poly", debug=debug_multi)

    # hi_part_table[i] = round(2^(i / 2^index_size))
    hi_part_table = ML_NewTable(dimensions=[2**index_size + 1],
                                storage_precision=self.precision,
                                tag=self.uniquify_name("exp2_table"),
                                const=True)
    for i in range(2**index_size + 1):
        input_value = i * 2**-index_size
        tab_value = self.precision.round_sollya_object(
            sollya.SollyaObject(2)**(input_value))
        hi_part_table[i] = tab_value

    hi_part_value = TableLoad(hi_part_table,
                              r_hi_int,
                              precision=self.precision,
                              tag="hi_part_value",
                              debug=debug_multi)

    # Handling special cases
    oflow_bound = Constant(self.precision.get_emax() + 1,
                           precision=self.precision)
    subnormal_bound = self.precision.get_emin_subnormal()
    uflow_bound = self.precision.get_emin_normal()
    Log.report(Log.Info, "oflow : ", oflow_bound)
    #print "uflow : ", uflow_bound
    #print "sub : ", subnormal_bound
    test_overflow = Comparison(vx, oflow_bound,
                               specifier=Comparison.GreaterOrEqual)
    test_overflow.set_attributes(tag="oflow_test",
                                 debug=debug_multi,
                                 likely=False,
                                 precision=ML_Bool)
    test_underflow = Comparison(vx, uflow_bound, specifier=Comparison.Less)
    test_underflow.set_attributes(tag="uflow_test",
                                  debug=debug_multi,
                                  likely=False,
                                  precision=ML_Bool)
    test_subnormal = Comparison(vx, subnormal_bound,
                                specifier=Comparison.Greater)
    test_subnormal.set_attributes(tag="sub_test",
                                  debug=debug_multi,
                                  likely=False,
                                  precision=ML_Bool)

    # subnormal result: scale by 2^(vx_int - emin) * 2^emin to avoid
    # building 2^vx_int directly below the normal range
    subnormal_offset = -(uflow_bound - vx_int)
    subnormal_offset.set_attributes(tag="offset", debug=debug_multi)
    exp_offset = ExponentInsertion(subnormal_offset,
                                   precision=self.precision,
                                   debug=debug_multi,
                                   tag="exp_offset")
    exp_min = ExponentInsertion(uflow_bound,
                                precision=self.precision,
                                debug=debug_multi,
                                tag="exp_min")
    subnormal_result = hi_part_value * exp_offset * exp_min * poly + \
        hi_part_value * exp_offset * exp_min

    test_std = LogicalOr(test_overflow,
                         test_underflow,
                         precision=ML_Bool,
                         tag="std_test",
                         likely=False,
                         debug=debug_multi)

    # Reconstruction: 2^x = table * 2^floor(x) * (1 + poly)
    result = hi_part_value * exp_X * poly + hi_part_value * exp_X
    result.set_attributes(tag="result", debug=debug_multi)

    C0 = Constant(0, precision=self.precision)

    if inline_select:
        scheme = Select(
            test_std,
            Select(test_overflow, FP_PlusInfty(self.precision),
                   Select(
                       test_subnormal,
                       subnormal_result,
                       C0,
                   )),
            result,
        )
        return scheme
    else:
        return_inf = Return(FP_PlusInfty(self.precision))
        return_C0 = Return(C0)
        return_sub = Return(subnormal_result)
        return_std = Return(result)

        non_std_statement = Statement(
            ConditionBlock(
                test_overflow, return_inf,
                ConditionBlock(test_subnormal, return_sub, return_C0)))

        scheme = Statement(
            ConditionBlock(test_std, non_std_statement, return_std))
        return scheme
def generate_scheme(self):
    """ Build the evaluation scheme of log(x) based on a reciprocal-seed
        argument reduction and a tabulated log of the seed values, with
        dedicated paths for subnormal inputs, exp=-1 inputs and special
        values (NaN / infinity / zero / negative). """
    vx = self.implementation.add_input_variable("x", self.precision)
    sollya_precision = self.precision.sollya_object

    # constant computation
    invlog2 = round(1 / log(2), sollya_precision, sollya.RN)
    invlog2_cst = Constant(invlog2, precision=self.precision)

    #v_log2_hi = round(log(2), 16, sollya.RN)
    #v_log2_lo = round(log(2) - v_log2_hi, sollya_precision, sollya.RN)
    #log2_hi = Constant(v_log2_hi, precision = self.precision, tag = "log2_hi")
    #log2_lo = Constant(v_log2_lo, precision = self.precision, tag = "log2_lo")

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    test_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                           debug=True, tag="nan_or_inf")
    test_nan = Test(vx, specifier=Test.IsNaN, debug=True, tag="is_nan_test")
    test_positive = Comparison(vx, 0, specifier=Comparison.GreaterOrEqual,
                               debug=True, tag="inf_sign")
    test_signaling_nan = Test(vx, specifier=Test.IsSignalingNaN, debug=True,
                              tag="is_signaling_nan")
    return_snan = Statement(
        ExpRaiseReturn(ML_FPE_Invalid, return_value=FP_QNaN(self.precision)))

    # 2-word (hi + lo) split of log(2)
    v_log2_hi = round(
        log(2),
        self.precision.get_field_size() -
        (self.precision.get_exponent_size() + 1), sollya.RN)
    v_log2_lo = round(
        log(2) - v_log2_hi, self.precision.sollya_object, sollya.RN)
    log2_hi = Constant(v_log2_hi, precision=self.precision, tag="log2_hi")
    log2_lo = Constant(v_log2_lo, precision=self.precision, tag="log2_lo")

    vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debug_multi)

    int_precision = self.precision.get_integer_format()

    # table creation: log_table[i] = (hi, lo) 2-word split of
    # log(inv_approx_table[i])
    table_index_size = 7
    log_table = ML_NewTable(dimensions=[2**table_index_size, 2],
                            storage_precision=self.precision,
                            tag=self.uniquify_name("inv_table"))
    # entry 0 maps to the arg_red_index == 1.0 case below
    log_table[0][0] = 0.0
    log_table[0][1] = 0.0

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision=self.precision)
    dummy_div_seed = ReciprocalSeed(dummy_var, precision=self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_div_seed,
        language=None,
        table_getter=lambda self: self.approx_table_map)

    integer_precision = {
        ML_Binary32: ML_UInt32,
        ML_Binary64: ML_UInt64
    }[self.precision]

    for i in range(1, 2**table_index_size):
        #inv_value = (1.0 + (self.processor.inv_approx_table[i] / S2**9) + S2**-52) * S2**-1
        inv_value = inv_approx_table[
            i]  # (1.0 + (inv_approx_table[i][0] / S2**9) ) * S2**-1
        value_high = round(
            log(inv_value),
            self.precision.get_field_size() -
            (self.precision.get_exponent_size() + 1), sollya.RN)
        value_low = round(
            log(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low

    def compute_log(_vx, exp_corr_factor=None):
        # Build the log evaluation graph for input _vx; exp_corr_factor,
        # when provided, is added to the extracted exponent (used for the
        # subnormal path where the input was pre-scaled by 2^100).
        # Returns (result, poly, log_inv_lo, log_inv_hi, red_vx, result_one).
        _vx_mant = MantissaExtraction(_vx,
                                      tag="_vx_mant",
                                      debug=debug_multi,
                                      precision=self.precision)
        _vx_exp = ExponentExtraction(_vx, tag="_vx_exp", debug=debug_multi)

        # top 7 mantissa bits select the table entry
        table_index = BitLogicAnd(BitLogicRightShift(
            TypeCast(_vx_mant, precision=int_precision, debug=debug_multi),
            self.precision.get_field_size() - 7,
            debug=debug_multi),
                                  0x7f,
                                  tag="table_index",
                                  debug=debug_multi)

        # argument reduction
        # TODO: detect if single operand inverse seed is supported by the targeted architecture
        # masking the seed's LSB keeps it consistent with the tabulated value
        pre_arg_red_index = TypeCast(BitLogicAnd(
            TypeCast(ReciprocalSeed(_vx_mant,
                                    precision=self.precision,
                                    tag="seed",
                                    debug=debug_multi,
                                    silent=True),
                     precision=integer_precision),
            Constant(-2, precision=integer_precision),
            precision=integer_precision),
                                     precision=self.precision,
                                     tag="pre_arg_red_index",
                                     debug=debug_multi)
        arg_red_index = Select(Equal(table_index, 0), 1.0,
                               pre_arg_red_index)

        #_red_vx = arg_red_index * _vx_mant - 1.0
        _red_vx = FusedMultiplyAdd(arg_red_index,
                                   _vx_mant,
                                   1.0,
                                   specifier=FusedMultiplyAdd.Subtract)
        _red_vx.set_attributes(tag="_red_vx", debug=debug_multi)

        inv_err = S2**-7
        red_interval = Interval(1 - inv_err, 1 + inv_err)

        # return in case of standard (non-special) input
        _log_inv_lo = TableLoad(log_table,
                                table_index,
                                1,
                                tag="log_inv_lo",
                                debug=debug_multi)
        _log_inv_hi = TableLoad(log_table,
                                table_index,
                                0,
                                tag="log_inv_hi",
                                debug=debug_multi)

        Log.report(Log.Verbose, "building mathematical polynomial")
        approx_interval = Interval(-inv_err, inv_err)
        poly_degree = sup(
            guessdegree(
                log(1 + sollya.x) / sollya.x, approx_interval, S2**
                -(self.precision.get_field_size() + 1))) + 1
        global_poly_object = Polynomial.build_from_approximation(
            log(1 + sollya.x) / sollya.x, poly_degree,
            [1] + [self.precision] * (poly_degree), approx_interval,
            sollya.absolute)
        poly_object = global_poly_object.sub_poly(start_index=1)

        Log.report(Log.Verbose, "generating polynomial evaluation scheme")
        #_poly = PolynomialSchemeEvaluator.generate_horner_scheme(poly_object, _red_vx, unified_precision = self.precision)
        _poly = PolynomialSchemeEvaluator.generate_estrin_scheme(
            poly_object, _red_vx, unified_precision=self.precision)
        _poly.set_attributes(tag="poly", debug=debug_multi)

        corr_exp = Conversion(
            _vx_exp if exp_corr_factor == None else _vx_exp +
            exp_corr_factor,
            precision=self.precision)

        split_red_vx = Split(_red_vx,
                             precision=ML_DoubleDouble,
                             tag="split_red_vx",
                             debug=debug_multi)
        red_vx_hi = split_red_vx.hi
        red_vx_lo = split_red_vx.lo

        # result = _red_vx * poly - log_inv_hi - log_inv_lo + _vx_exp * log2_hi + _vx_exp * log2_lo
        pre_result = -_log_inv_hi + (_red_vx + (_red_vx * _poly +
                                                (corr_exp * log2_lo -
                                                 _log_inv_lo)))
        pre_result.set_attributes(tag="pre_result", debug=debug_multi)
        exact_log2_hi_exp = corr_exp * log2_hi
        exact_log2_hi_exp.set_attributes(tag="exact_log2_hi_exp",
                                         debug=debug_multi)

        # alternate reconstruction for the close-to-1 case (result_one)
        cancel_part = (corr_exp * log2_hi - _log_inv_hi)
        cancel_part.set_attributes(tag="cancel_part", debug=debug_multi)
        sub_part = red_vx_hi + cancel_part
        sub_part.set_attributes(tag="sub_part", debug=debug_multi)
        #result_one_low_part = (red_vx_hi * _poly + (red_vx_lo + (red_vx_lo * _poly + (corr_exp * log2_lo - _log_inv_lo))))
        result_one_low_part = ((red_vx_lo + (red_vx_lo * _poly +
                                             (corr_exp * log2_lo -
                                              _log_inv_lo))))
        result_one_low_part.set_attributes(tag="result_one_low_part",
                                           debug=debug_multi)
        _result_one = (
            (sub_part) + red_vx_hi * _poly) + result_one_low_part

        return exact_log2_hi_exp + pre_result, _poly, _log_inv_lo, _log_inv_hi, _red_vx, _result_one

    result, poly, log_inv_lo, log_inv_hi, red_vx, new_result_one = compute_log(
        vx)
    result.set_attributes(tag="result", debug=debug_multi)
    new_result_one.set_attributes(tag="new_result_one", debug=debug_multi)

    # special-input predicates
    neg_input = Comparison(vx, 0, likely=False, specifier=Comparison.Less,
                           debug=debug_multi, tag="neg_input")
    vx_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                         debug=debug_multi, tag="nan_or_inf")
    vx_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False,
                   debug=debug_multi, tag="snan")
    vx_inf = Test(vx, specifier=Test.IsInfty, likely=False,
                  debug=debug_multi, tag="inf")
    vx_subnormal = Test(vx, specifier=Test.IsSubnormal, likely=False,
                        debug=debug_multi, tag="vx_subnormal")
    vx_zero = Test(vx, specifier=Test.IsZero, likely=False,
                   debug=debug_multi, tag="vx_zero")

    exp_mone = Equal(vx_exp, -1, tag="exp_minus_one", debug=debug_multi,
                     likely=False)
    vx_one = Equal(vx, 1.0, tag="vx_one", likely=False, debug=debug_multi)

    # exp=-1 case
    Log.report(Log.Verbose, "managing exp=-1 case")
    result2 = (-log_inv_hi - log2_hi) + ((red_vx + poly * red_vx) -
                                         log2_lo - log_inv_lo)
    result2.set_attributes(tag="result2", debug=debug_multi)

    # subnormal case: scale the input by 2^100 and correct the exponent back
    m100 = -100
    S2100 = Constant(S2**100, precision=self.precision)
    result_subnormal, _, _, _, _, _ = compute_log(vx * S2100,
                                                  exp_corr_factor=m100)

    Log.report(Log.Verbose, "managing close to 1.0 cases")
    one_err = S2**-7
    approx_interval_one = Interval(-one_err, one_err)
    red_vx_one = vx - 1.0
    poly_degree_one = sup(
        guessdegree(
            log(1 + sollya.x) / sollya.x, approx_interval_one, S2**
            -(self.precision.get_field_size() + 1))) + 1
    poly_object_one = Polynomial.build_from_approximation(
        log(1 + sollya.x) / sollya.x, poly_degree_one,
        [self.precision] * (poly_degree_one + 1), approx_interval_one,
        sollya.absolute).sub_poly(start_index=1)
    poly_one = PolynomialSchemeEvaluator.generate_horner_scheme(
        poly_object_one, red_vx_one, unified_precision=self.precision)
    poly_one.set_attributes(tag="poly_one", debug=debug_multi)
    result_one = red_vx_one + red_vx_one * poly_one
    cond_one = (vx < (1 + one_err)) & (vx > (1 - one_err))
    cond_one.set_attributes(tag="cond_one", debug=debug_multi, likely=False)

    # main scheme
    pre_scheme = ConditionBlock(
        neg_input,
        Statement(ClearException(), Raise(ML_FPE_Invalid),
                  Return(FP_QNaN(self.precision))),
        ConditionBlock(
            vx_nan_or_inf,
            ConditionBlock(
                vx_inf,
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                Statement(ClearException(),
                          ConditionBlock(vx_snan, Raise(ML_FPE_Invalid)),
                          Return(FP_QNaN(self.precision)))),
            ConditionBlock(
                vx_subnormal,
                ConditionBlock(
                    vx_zero,
                    Statement(
                        ClearException(),
                        Raise(ML_FPE_DivideByZero),
                        Return(FP_MinusInfty(self.precision)),
                    ),
                    Return(result_subnormal)),
                ConditionBlock(
                    vx_one,
                    Statement(
                        ClearException(),
                        Return(FP_PlusZero(self.precision)),
                    ),
                    ConditionBlock(exp_mone, Return(result2),
                                   Return(result))
                    #ConditionBlock(cond_one,
                    #Return(new_result_one),
                    #ConditionBlock(exp_mone,
                    #Return(result2),
                    #Return(result)
                    #)
                    #)
                ))))
    scheme = pre_scheme
    return scheme
def generate_scheme(self):
    """ Build a hardware piecewise-linear approximation of self.function
        over self.interval, using a table of initial values (indexed by
        alpha+beta bits) and a table of slope offsets (indexed by
        alpha+gamma bits), then returns the implementation entity. """
    ## convert @p value from an input floating-point precision
    #  @p in_precision to an output support format @p out_precision
    io_precision = self.precision

    # declaring main input variable
    vx = self.implementation.add_input_signal("x", io_precision)
    # rounding mode input
    rnd_mode = self.implementation.add_input_signal(
        "rnd_mode", rnd_mode_format)

    # size of most significant table index (for linear slope tabulation)
    alpha = self.alpha  # 6
    # size of medium significant table index (for initial value table index LSB)
    beta = self.beta  # 5
    # size of least significant table index (for linear offset tabulation)
    gamma = self.gamma  # 5

    guard_bits = self.guard_bits  # 3

    vx.set_interval(self.interval)

    range_hi = sollya.sup(self.interval)
    range_lo = sollya.inf(self.interval)
    f_hi = self.function(range_hi)
    f_lo = self.function(range_lo)
    # fixed by format used for reduced_x
    range_size = range_hi - range_lo
    range_size_log2 = int(sollya.log2(range_size))
    assert 2**range_size_log2 == range_size

    print("range_size_log2={}".format(range_size_log2))
    reduced_x = Conversion(BitLogicRightShift(vx - range_lo,
                                              range_size_log2),
                           precision=fixed_point(0,
                                                 alpha + beta + gamma,
                                                 signed=False),
                           tag="reduced_x",
                           debug=debug_fixed)

    alpha_index = get_fixed_slice(reduced_x,
                                  0,
                                  alpha - 1,
                                  align_hi=FixedPointPosition.FromMSBToLSB,
                                  align_lo=FixedPointPosition.FromMSBToLSB,
                                  tag="alpha_index",
                                  debug=debug_std)
    gamma_index = get_fixed_slice(reduced_x,
                                  gamma - 1,
                                  0,
                                  align_hi=FixedPointPosition.FromLSBToLSB,
                                  align_lo=FixedPointPosition.FromLSBToLSB,
                                  tag="gamma_index",
                                  debug=debug_std)
    beta_index = get_fixed_slice(reduced_x,
                                 alpha,
                                 gamma,
                                 align_hi=FixedPointPosition.FromMSBToLSB,
                                 align_lo=FixedPointPosition.FromLSBToLSB,
                                 tag="beta_index",
                                 debug=debug_std)

    # Assuming monotonic function
    f_absmax = max(abs(f_hi), abs(f_lo))
    f_absmin = min(abs(f_hi), abs(f_lo))
    f_msb = int(sollya.ceil(sollya.log2(f_absmax))) + 1
    f_lsb = int(sollya.floor(sollya.log2(f_absmin)))
    storage_lsb = f_lsb - io_precision.get_bit_size() - guard_bits

    f_int_size = f_msb
    f_frac_size = -storage_lsb

    storage_format = fixed_point(f_int_size, f_frac_size, signed=False)
    Log.report(Log.Info, "storage_format is {}".format(storage_format))

    # table of initial value index
    tiv_index = Concatenation(alpha_index,
                              beta_index,
                              tag="tiv_index",
                              debug=debug_std)
    # table of offset value index
    to_index = Concatenation(alpha_index,
                             gamma_index,
                             tag="to_index",
                             debug=debug_std)

    tiv_index_size = alpha + beta
    to_index_size = alpha + gamma

    Log.report(Log.Info, "initial table structures")
    table_iv = ML_NewTable(dimensions=[2**tiv_index_size],
                           storage_precision=storage_format,
                           tag="tiv")
    table_offset = ML_NewTable(dimensions=[2**to_index_size],
                               storage_precision=storage_format,
                               tag="to")

    slope_table = [None] * (2**alpha)
    slope_delta = 1.0 / sollya.SollyaObject(2**alpha)
    delta_u = range_size * slope_delta * 2**-15
    Log.report(Log.Info, "computing slope value")
    for i in range(2**alpha):
        # slope is computed at the middle of range_size interval
        slope_x = range_lo + (i + 0.5) * range_size * slope_delta
        # TODO: gross approximation of derivatives
        f_xpu = self.function(slope_x + delta_u / 2)
        f_xmu = self.function(slope_x - delta_u / 2)
        slope = (f_xpu - f_xmu) / delta_u
        slope_table[i] = slope

    range_rcp_steps = 1.0 / sollya.SollyaObject(2**tiv_index_size)
    Log.report(Log.Info, "computing value for initial-value table")
    for i in range(2**tiv_index_size):
        # integer division: beta LSBs of i select the row within a slope
        slope_index = i // 2**beta
        iv_x = range_lo + i * range_rcp_steps * range_size
        offset_x = 0.5 * range_rcp_steps * range_size
        # initial value is computed so that the piecewise linear
        # approximation intersects the function at iv_x + offset_x
        iv_y = self.function(
            iv_x + offset_x) - offset_x * slope_table[int(slope_index)]
        initial_value = storage_format.round_sollya_object(iv_y)
        table_iv[i] = initial_value

    # determining table of initial value interval
    tiv_min = table_iv[0]
    tiv_max = table_iv[0]
    for i in range(1, 2**tiv_index_size):
        tiv_min = min(tiv_min, table_iv[i])
        tiv_max = max(tiv_max, table_iv[i])
    table_iv.set_interval(Interval(tiv_min, tiv_max))

    offset_step = range_size / S2**(alpha + beta + gamma)
    for i in range(2**alpha):
        Log.report(Log.Info,
                   "computing offset value for sub-table {}".format(i))
        for j in range(2**gamma):
            to_i = i * 2**gamma + j
            offset = slope_table[i] * j * offset_step
            table_offset[to_i] = offset

    # determining table of offset interval
    to_min = table_offset[0]
    to_max = table_offset[0]
    for i in range(1, 2**(alpha + gamma)):
        to_min = min(to_min, table_offset[i])
        to_max = max(to_max, table_offset[i])
    offset_interval = Interval(to_min, to_max)
    table_offset.set_interval(offset_interval)

    initial_value = TableLoad(table_iv,
                              tiv_index,
                              precision=storage_format,
                              tag="initial_value",
                              debug=debug_fixed)

    offset_precision = get_fixed_type_from_interval(offset_interval, 16)
    print("offset_precision is {} ({} bits)".format(
        offset_precision, offset_precision.get_bit_size()))
    table_offset.get_precision().storage_precision = offset_precision

    # rounding table value (entry 0 is an exact zero offset and needs no
    # rounding)
    for i in range(1, 2**(alpha + gamma)):
        table_offset[i] = offset_precision.round_sollya_object(
            table_offset[i])

    offset_value = TableLoad(table_offset,
                             to_index,
                             precision=offset_precision,
                             tag="offset_value",
                             debug=debug_fixed)

    Log.report(
        Log.Verbose,
        "initial_value's interval: {}, offset_value's interval: {}".format(
            evaluate_range(initial_value), evaluate_range(offset_value)))

    final_add = initial_value + offset_value
    # TODO: rounding of the final sum is not implemented yet
    # round_bit = final_add + FixedPointPosition(final_add, io_precision.get_bit_size(), align=FixedPointPosition.FromMSBToLSB)
    vr_out = Conversion(final_add,
                        precision=io_precision,
                        tag="vr_out",
                        debug=debug_fixed)

    self.implementation.add_output_signal("vr_out", vr_out)

    # Approximation error evaluation
    approx_error = 0.0
    for i in range(2**alpha):
        for j in range(2**beta):
            tiv_i = (i * 2**beta + j)
            # = range_lo + tiv_i * range_rcp_steps * range_size
            iv = table_iv[tiv_i]
            for k in range(2**gamma):
                to_i = i * 2**gamma + k
                offset = table_offset[to_i]
                approx_value = offset + iv
                table_x = range_lo + range_size * (
                    (i * 2**beta + j) * 2**gamma + k) / S2**(alpha + beta +
                                                             gamma)
                # FIX: compare against self.function (the function being
                # approximated); the previous code hard-coded 1/table_x,
                # a leftover from a reciprocal-only version
                local_error = abs(self.function(table_x) - approx_value)
                approx_error = max(approx_error, local_error)
    error_log2 = float(sollya.log2(approx_error))
    print("approx_error is {}, error_log2 is {}".format(
        float(approx_error), error_log2))

    # table size
    table_iv_size = 2**(alpha + beta)
    table_offset_size = 2**(alpha + gamma)
    print("tables' size are {} entries".format(table_iv_size +
                                               table_offset_size))

    return [self.implementation]
def generate_scheme(self): memory_limit = 2500 # local overloading of RaiseReturn operation def ExpRaiseReturn(*args, **kwords): kwords["arg_value"] = input_var kwords["function_name"] = self.function_name return RaiseReturn(*args, **kwords) ### Constants computations ### v_log2_hi = nearestint(log(2) * 2**-52) * 2**52 v_log2_lo = round(log(2) - v_log2_hi, 64 + 53, sollya.RN) log2_hi = Constant(v_log2_hi, precision=self.precision, tag="log2_hi") log2_lo = Constant(v_log2_lo, precision=self.precision, tag="log2_lo") print "\n\033[1mSearch parameters for the argument reduction:\033[0m (this can take a while)" arg_reduc = self.generate_argument_reduction(memory_limit) print "\n\033[1mArgument reduction found:\033[0m [({},{}),({},{})] -> polynomials of degree {},{}, using {} bytes of memory".format( arg_reduc['size1'], arg_reduc['prec1'], arg_reduc['size2'], arg_reduc['prec2'], arg_reduc['degree_poly1'], arg_reduc['degree_poly2'], arg_reduc['sizeof_tables']) print "\n\033[1mGenerate the first logarithm table:\033[0m containing {} elements, using {} bytes of memory".format( arg_reduc['length_table1'], arg_reduc['sizeof_table1']) inv_table_1 = ML_NewTable( dimensions=[arg_reduc['length_table1']], storage_precision=ML_Custom_FixedPoint_Format( 1, arg_reduc['prec1'], False), tag=self.uniquify_name("inv_table_1")) log_table_1 = ML_NewTable( dimensions=[arg_reduc['length_table1']], storage_precision=ML_Custom_FixedPoint_Format(11, 128 - 11, False), tag=self.uniquify_name("log_table_1")) for i in range(0, arg_reduc['length_table1'] - 1): x1 = 1 + i / S2 * arg_reduc['size1'] inv_x1 = ceil(S2**arg_reduc['prec1'] / x1) * S2**arg_reduc['prec1'] log_x1 = floor(log(x1) * S2**(128 - 11)) * S2**(11 - 128) inv_table_1[ i] = inv_x1 #Constant(inv_x1, precision = ML_Custom_FixedPoint_Format(1, arg_reduc['prec1'], False)) log_table_1[ i] = log_x1 #Constant(log_x1, precision = ML_Custom_FixedPoint_Format(11, 128-11, False)) print "\n\033[1mGenerate the second logarithm table:\033[0m 
containing {} elements, using {} bytes of memory".format( arg_reduc['length_table2'], arg_reduc['sizeof_table2']) inv_table_2 = ML_NewTable( dimensions=[arg_reduc['length_table2']], storage_precision=ML_Custom_FixedPoint_Format( 1, arg_reduc['prec2'], False), tag=self.uniquify_name("inv_table_2")) log_table_2 = ML_NewTable( dimensions=[arg_reduc['length_table2']], storage_precision=ML_Custom_FixedPoint_Format(11, 128 - 11, False), tag=self.uniquify_name("log_table_2")) for i in range(0, arg_reduc['length_table2'] - 1): y1 = 1 + i / S2**arg_reduc['size2'] inv_y1 = ceil(S2**arg_reduc['prec2'] / x1) * S2**arg_reduc['prec2'] log_y1 = floor(log(inv_y1) * S2**(128 - 11)) * S2**(11 - 128) inv_table_2[ i] = inv_y1 #Constant(inv_y1, precision = ML_Custom_FixedPoint_Format(1, arg_reduc['prec2'], False)) log_table_2[ i] = log_y1 #Constant(log_y1, precision = ML_Custom_FixedPoint_Format(11, 128-11, False)) ### Evaluation Scheme ### print "\n\033[1mGenerate the evaluation scheme:\033[0m" input_var = self.implementation.add_input_variable( "input_var", self.precision) ve = ExponentExtraction(input_var, tag="x_exponent", debug=debugd) vx = MantissaExtraction(input_var, tag="x_mantissa", precision=ML_Custom_FixedPoint_Format( 0, 52, False), debug=debug_lftolx) #vx = MantissaExtraction(input_var, tag = "x_mantissa", precision = self.precision, debug = debug_lftolx) print "filtering and handling special cases" test_is_special_cases = LogicalNot( Test(input_var, specifier=Test.IsIEEENormalPositive, likely=True, debug=debugd, tag="is_special_cases")) handling_special_cases = Statement( ConditionBlock( Test(input_var, specifier=Test.IsSignalingNaN, debug=True), ExpRaiseReturn(ML_FPE_Invalid, return_value=FP_QNaN(self.precision))), ConditionBlock(Test(input_var, specifier=Test.IsNaN, debug=True), Return(input_var)) #, # TODO: add tests for x == 0 (raise DivideByZero, return -Inf), x < 0 (raise InvalidOperation, return qNaN) # all that remains is x is a subnormal positive #Statement( # 
ReferenceAssign(Dereference(ve), Subtraction(ve, Subtraction(CountLeadingZeros(input_var, tag = 'subnormal_clz', precision = ve.get_precision()), Constant(12, precision = ve.get_precision())))), # ReferenceAssign(Dereference(vx), BitLogicLeftShift(vx, Addition(CountLeadingZeros(input_var, tag = 'subnormal_clz', precision = ve.get_precision()), Constant(1, precision = ve.get_precision())))) #) ) print "doing the argument reduction" v_dx = vx v_x1 = Conversion(v_dx, tag='x1', precision=ML_Custom_FixedPoint_Format( 0, arg_reduc['size1'], False), rounding_mode=ML_RoundTowardMinusInfty) v_index_x = TypeCast( v_x1, tag='index_x', precision=ML_Int32 ) #ML_Custom_FixedPoint_Format(v_x1.get_precision().get_c_bit_size(), 0, False)) v_inv_x = TableLoad(inv_table_1, v_index_x, tag='inv_x') v_x = Addition(v_dx, 1, tag='x', precision=ML_Custom_FixedPoint_Format(1, 52, False)) v_dy = Multiplication(v_x, v_inv_x, tag='dy', precision=ML_Custom_FixedPoint_Format( 0, 52 + arg_reduc['prec1'], False)) v_y1 = Conversion(v_dy, tag='y1', precision=ML_Custom_FixedPoint_Format( 0, arg_reduc['size2'], False), rounding_mode=ML_RoundTowardMinusInfty) v_index_y = TypeCast( v_y1, tag='index_y', precision=ML_Int32 ) #ML_Custom_FixedPoint_Format(v_y1.get_precision().get_c_bit_size(), 0, False)) v_inv_y = TableLoad(inv_table_2, v_index_y, tag='inv_y') v_y = Addition(v_dy, 1, tag='y', precision=ML_Custom_FixedPoint_Format( 1, 52 + arg_reduc['prec2'], False)) # note that we limit the number of bits used to represent dz to 64. # we proved during the arg reduction that we can do that (sup(out_interval) < 2^(64-52-prec1-prec2)) v_dz = Multiplication( v_y, v_inv_y, tag='z', precision=ML_Custom_FixedPoint_Format( 64 - 52 - arg_reduc['prec1'] - arg_reduc['prec2'], 52 + arg_reduc['prec1'] + arg_reduc['prec2'], False)) # reduce the number of bits used to represent dz. 
we can do that print "doing the first polynomial evaluation" global_poly1_object = Polynomial.build_from_approximation( log(1 + sollya.x) / sollya.x, arg_reduc['degree_poly1'] - 1, [64] * (arg_reduc['degree_poly1']), arg_reduc['out_interval'], fixed, sollya.absolute) poly1_object = global_poly1_object.sub_poly(start_index=1) print global_poly1_object print poly1_object poly1 = PolynomialSchemeEvaluator.generate_horner_scheme( poly1_object, v_dz, unified_precision=v_dz.get_precision()) return ConditionBlock(test_is_special_cases, handling_special_cases, Return(poly1)) #approx_interval = Interval(0, 27021597764222975*S2**-61) #poly_degree = 1+sup(guessdegree(log(1+x)/x, approx_interval, S2**-(self.precision.get_field_size()))) #global_poly_object = Polynomial.build_from_approximation(log(1+x)/x, poly_degree, [1] + [self.precision]*(poly_degree), approx_interval, sollya.absolute) #poly_object = global_poly_object.sub_poly(start_index = 1) #_poly = PolynomialSchemeEvaluator.generate_horner_scheme(poly_object, _red_vx, unified_precision = self.precision) #_poly.set_attributes(tag = "poly", debug = debug_lftolx) """
def generate_scheme(self): """ generate scheme """ vx = self.implementation.add_input_variable("x", self.get_input_precision()) # retrieving processor inverse approximation table lo_bound_global = SollyaObject(0.0) hi_bound_global = SollyaObject(0.75) approx_interval = Interval(lo_bound_global, hi_bound_global) approx_interval_size = hi_bound_global - lo_bound_global # table creation table_index_size = 7 field_index_size = 2 exp_index_size = table_index_size - field_index_size table_size = 2**table_index_size table_index_range = range(table_size) local_degree = 9 coeff_table = ML_NewTable(dimensions=[table_size, local_degree], storage_precision=self.precision) exp_lo = 2**exp_index_size for i in table_index_range: lo_bound = (1.0 + (i % 2**field_index_size) * S2**-field_index_size ) * S2**(i / 2**field_index_size - exp_lo) hi_bound = (1.0 + ((i % 2**field_index_size) + 1) * S2**-field_index_size ) * S2**(i / 2**field_index_size - exp_lo) local_approx_interval = Interval(lo_bound, hi_bound) local_poly_object, local_error = Polynomial.build_from_approximation_with_error( acos(1 - sollya.x), local_degree, [self.precision] * (local_degree + 1), local_approx_interval, sollya.absolute) local_error = int( log2(sup(abs(local_error / acos(1 - local_approx_interval))))) coeff_table for d in range(local_degree): coeff_table[i][d] = sollya.coeff( local_poly_object.get_sollya_object(), d) table_index = BitLogicRightShift( vx, vx.get_precision().get_field_size() - field_index_size) - (exp_lo << field_index_size) print "building mathematical polynomial" poly_degree = sup( sollya.guessdegree(acos(x), approx_interval, S2**-(self.precision.get_field_size()))) print "guessed polynomial degree: ", int(poly_degree) #global_poly_object = Polynomial.build_from_approximation(log10(1+x)/x, poly_degree, [self.precision]*(poly_degree+1), approx_interval, absolute) print "generating polynomial evaluation scheme" #_poly = PolynomialSchemeEvaluator.generate_horner_scheme(poly_object, _red_vx, 
unified_precision = self.precision) # building eval error map #eval_error_map = { # red_vx: Variable("red_vx", precision = self.precision, interval = red_vx.get_interval()), # log_inv_hi: Variable("log_inv_hi", precision = self.precision, interval = table_high_interval), # log_inv_lo: Variable("log_inv_lo", precision = self.precision, interval = table_low_interval), #} # computing gappa error #poly_eval_error = self.get_eval_error(result, eval_error_map) # main scheme print "MDL scheme" scheme = Statement(Return(vx)) return scheme
def generate_payne_hanek(vx,
                         frac_pi,
                         precision,
                         n=100,
                         k=4,
                         chunk_num=None,
                         debug=False):
    """ generate payne and hanek argument reduction for frac_pi * variable

    Builds an operation graph computing the reduction of <vx> multiplied by
    the high-precision constant <frac_pi>, splitting the constant into
    exactly-representable chunks stored in tables.

    :param vx: input operation node to reduce
    :param frac_pi: high-precision multiplicand constant (SollyaObject)
    :param precision: working floating-point format (ML_Binary32/64)
    :param n: target accuracy (bits) of the reduction
    :param k: number of kept integer bits (result integer part mod 2**(k+1))
    :param chunk_num: unused in the visible implementation
    :param debug: enable extra debug attributes on intermediate nodes
    :return: tuple (result, acc, acc_int) where result is the reduction
             statement, acc the fractional accumulator Variable and
             acc_int the integer accumulator Variable """
    sollya.roundingwarnings = sollya.off
    debug_precision = debug_multi
    int_precision = {ML_Binary32: ML_Int32, ML_Binary64: ML_Int64}[precision]
    p = precision.get_field_size()

    # weight of the most significant digit of the constant
    cst_msb = floor(log2(abs(frac_pi)))
    # length of exponent range which must be covered by the approximation
    # of the constant
    cst_exp_range = cst_msb - precision.get_emin_subnormal() + 1

    # chunk size has to be so than multiplication by a splitted <v>
    # (vx_hi or vx_lo) is exact
    chunk_size = precision.get_field_size() / 2 - 2
    chunk_number = int(ceil((cst_exp_range + chunk_size - 1) / chunk_size))
    scaling_factor = S2**-(chunk_size / 2)

    chunk_size_cst = Constant(chunk_size, precision=ML_Int32)
    cst_msb_node = Constant(cst_msb, precision=ML_Int32)

    # Saving sollya's global precision
    old_global_prec = sollya.settings.prec
    sollya.settings.prec(cst_exp_range + n)

    # table to store chunk of constant multiplicand
    cst_table = ML_NewTable(dimensions=[chunk_number, 1],
                            storage_precision=precision,
                            tag="PH_cst_table")
    # table to store sqrt(scaling_factor) corresponding to the
    # cst multiplicand chunks
    scale_table = ML_NewTable(dimensions=[chunk_number, 1],
                              storage_precision=precision,
                              tag="PH_scale_table")
    tmp_cst = frac_pi

    # cst_table stores normalized constant chunks (they have been
    # scale back to close to 1.0 interval)
    #
    # scale_table stores the scaling factors corresponding to the
    # denormalization of cst_table coefficients

    # this loop divide the digits of frac_pi into chunks
    # the chunk lsb weight is given by a shift from
    # cst_msb, multiple of the chunk index
    for i in range(chunk_number):
        value_div_factor = S2**(chunk_size * (i + 1) - cst_msb)
        local_cst = int(tmp_cst * value_div_factor) / value_div_factor
        local_scale = (scaling_factor**i)
        # storing scaled constant chunks
        cst_table[i][0] = local_cst / (local_scale**2)
        scale_table[i][0] = local_scale
        # Updating constant value
        tmp_cst = tmp_cst - local_cst

    # Computing which part of the constant we do not need to multiply
    # In the following comments, vi represents the bit of frac_pi of weight 2**-i

    # Bits vi so that i <= (vx_exp - p + 1 -k)  are not needed, because they result
    # in a multiple of 2pi and do not contribute to trig functions.
    vx_exp = ExponentExtraction(
        vx, precision=vx.get_precision().get_integer_format())
    vx_exp = Conversion(vx_exp, precision=ML_Int32)
    msb_exp = -(vx_exp - p + 1 - k)
    msb_exp.set_attributes(tag="msb_exp", debug=debug_multi)
    msb_exp = Conversion(msb_exp, precision=ML_Int32)

    # Select the highest index where the reduction should start
    msb_index = Select(cst_msb_node < msb_exp, 0,
                       (cst_msb_node - msb_exp) / chunk_size_cst)
    msb_index.set_attributes(tag="msb_index", debug=debug_multi)

    # For a desired accuracy of 2**-n, bits vi so that i >= (vx_exp + n + 4) are not needed, because they contribute less than
    # 2**-n to the result
    lsb_exp = -(vx_exp + n + 4)
    lsb_exp.set_attributes(tag="lsb_exp", debug=debug_multi)
    lsb_exp = Conversion(lsb_exp, precision=ML_Int32)

    # Index of the corresponding chunk
    lsb_index = (cst_msb_node - lsb_exp) / chunk_size_cst
    lsb_index.set_attributes(tag="lsb_index", debug=debug_multi)

    # Splitting vx into a hi and a lo part so that chunk products are exact
    half_size = precision.get_field_size() / 2 + 1

    # hi part (most significant digit) of vx input
    vx_hi = TypeCast(BitLogicAnd(
        TypeCast(vx, precision=int_precision),
        Constant(~int(2**half_size - 1), precision=int_precision)),
                     precision=precision)
    vx_hi.set_attributes(tag="vx_hi_ph")  #, debug = debug_multi)

    vx_lo = vx - vx_hi
    vx_lo.set_attributes(tag="vx_lo_ph")  #, debug = debug_multi)

    # loop iterator variable
    vi = Variable("i", precision=ML_Int32, var_type=Variable.Local)
    # step scaling factor
    half_scaling = Constant(S2**(-chunk_size / 2), precision=precision)

    i1 = Constant(1, precision=ML_Int32)

    # accumulator to the output precision
    acc = Variable("acc", precision=precision, var_type=Variable.Local)
    # integer accumulator
    acc_int = Variable("acc_int",
                       precision=int_precision,
                       var_type=Variable.Local)

    init_loop = Statement(
        vx_hi,
        vx_lo,
        ReferenceAssign(vi, msb_index),
        ReferenceAssign(acc, Constant(0, precision=precision)),
        ReferenceAssign(acc_int, Constant(0, precision=int_precision)),
    )

    cst_load = TableLoad(cst_table,
                         vi,
                         0,
                         tag="cst_load",
                         debug=debug_precision)
    sca_load = TableLoad(scale_table,
                         vi,
                         0,
                         tag="sca_load",
                         debug=debug_precision)
    # loop body
    # hi_mult = vx_hi * <scale_factor> * <cst>
    hi_mult = (vx_hi * sca_load) * (cst_load * sca_load)
    hi_mult.set_attributes(tag="hi_mult", debug=debug_precision)
    # NOTE(review): 'debuglld' below looks like it may be a typo for another
    # debug descriptor -- confirm against the module's debug utilities
    pre_hi_mult_int = NearestInteger(hi_mult,
                                     precision=int_precision,
                                     tag="hi_mult_int",
                                     debug=(debuglld if debug else None))
    hi_mult_int_f = Conversion(pre_hi_mult_int,
                               precision=precision,
                               tag="hi_mult_int_f",
                               debug=debug_precision)
    pre_hi_mult_red = (hi_mult - hi_mult_int_f).modify_attributes(
        tag="hi_mult_red", debug=debug_precision)

    # for the first chunks (vx_hi * <constant chunk>) exceeds 2**k+1 and may be
    # discard (whereas it may lead to overflow during integer conversion
    pre_exclude_hi = ((cst_msb_node - (vi + i1) * chunk_size + i1) +
                      (vx_exp + Constant(-half_size + 1, precision=ML_Int32))
                      ).modify_attributes(tag="pre_exclude_hi",
                                          debug=(debugd if debug else None))
    pre_exclude_hi.propagate_precision(ML_Int32,
                                       [cst_msb_node, vi, vx_exp, i1])
    Ck = Constant(k, precision=ML_Int32)
    exclude_hi = pre_exclude_hi <= Ck
    exclude_hi.set_attributes(tag="exclude_hi", debug=debug_multi)

    hi_mult_red = Select(exclude_hi, pre_hi_mult_red,
                         Constant(0, precision=precision))
    hi_mult_int = Select(exclude_hi, pre_hi_mult_int,
                         Constant(0, precision=int_precision))

    # lo part of the chunk reduction
    lo_mult = (vx_lo * sca_load) * (cst_load * sca_load)
    lo_mult.set_attributes(tag="lo_mult")  #, debug = debug_multi)
    lo_mult_int = NearestInteger(lo_mult,
                                 precision=int_precision,
                                 tag="lo_mult_int")  #, debug = debug_multi
    lo_mult_int_f = Conversion(lo_mult_int,
                               precision=precision,
                               tag="lo_mult_int_f")  #, debug = debug_multi)
    lo_mult_red = (lo_mult - lo_mult_int_f).modify_attributes(
        tag="lo_mult_red")  #, debug = debug_multi

    # accumulating fractional part
    acc_expr = (acc + hi_mult_red) + lo_mult_red
    # accumulating integer part
    int_expr = ((acc_int + hi_mult_int) + lo_mult_int) % 2**(k + 1)

    CF1 = Constant(1, precision=precision)
    CI1 = Constant(1, precision=int_precision)

    # extracting exceeding integer part in fractionnal accumulator
    acc_expr_int = NearestInteger(acc_expr, precision=int_precision)
    # normalizing integer and fractionnal accumulator by subtracting then
    # adding exceeding integer part
    normalization = Statement(
        ReferenceAssign(
            acc, acc_expr - Conversion(acc_expr_int, precision=precision)),
        ReferenceAssign(acc_int, int_expr + acc_expr_int),
    )

    acc_expr.set_attributes(tag="acc_expr")  #, debug = debug_multi)
    int_expr.set_attributes(tag="int_expr")  #, debug = debug_multi)

    # iterate from the msb chunk down to the lsb chunk, accumulating the
    # fractional and (mod 2**(k+1)) integer contributions of each chunk
    red_loop = Loop(
        init_loop, vi <= lsb_index,
        Statement(acc_expr, int_expr, normalization,
                  ReferenceAssign(vi, vi + 1)))

    result = Statement(lsb_index, msb_index, red_loop)

    # restoring sollya's global precision
    sollya.settings.prec = old_global_prec

    return result, acc, acc_int
def generate_scheme(self):
    """ Generate the abstract operation scheme for sine / cosine.

    Performs a Cody-Waite argument reduction for small arguments and a
    Payne-Hanek reduction for large arguments, then combines tabulated
    cos values with small polynomial corrections.
    :return: scheme root node (ConditionBlock) """
    # declaring CodeFunction and retrieving input variable
    vx = self.implementation.add_input_variable("x", self.precision)

    Log.report(Log.Info, "generating implementation scheme")
    if self.debug_flag:
        Log.report(Log.Info, "debug has been enabled")

    # local overloading of RaiseReturn operation
    def SincosRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    sollya_precision = self.precision.get_sollya_object()
    hi_precision = self.precision.get_field_size() - 8
    cw_hi_precision = self.precision.get_field_size() - 4

    ext_precision = {
        ML_Binary32: ML_Binary64,
        ML_Binary64: ML_Binary64
    }[self.precision]

    int_precision = {
        ML_Binary32: ML_Int32,
        ML_Binary64: ML_Int64
    }[self.precision]

    # threshold above which the Payne-Hanek reduction path is taken
    if self.precision is ML_Binary32:
        ph_bound = S2**10
    else:
        ph_bound = S2**33

    test_ph_bound = Comparison(vx,
                               ph_bound,
                               specifier=Comparison.GreaterOrEqual,
                               precision=ML_Bool,
                               likely=False)

    # argument reduction
    # m
    frac_pi_index = {ML_Binary32: 10, ML_Binary64: 14}[self.precision]

    C0 = Constant(0, precision=int_precision)
    C1 = Constant(1, precision=int_precision)
    C_offset = Constant(3 * S2**(frac_pi_index - 1), precision=int_precision)

    # 2^m / pi
    frac_pi = round(S2**frac_pi_index / pi, cw_hi_precision, sollya.RN)
    frac_pi_lo = round(S2**frac_pi_index / pi - frac_pi, sollya_precision,
                       sollya.RN)
    # pi / 2^m, high part
    inv_frac_pi = round(pi / S2**frac_pi_index, cw_hi_precision, sollya.RN)
    # pi / 2^m, low part
    inv_frac_pi_lo = round(pi / S2**frac_pi_index - inv_frac_pi,
                           sollya_precision, sollya.RN)

    # computing k
    vx.set_attributes(tag="vx", debug=debug_multi)

    vx_pi = Addition(Multiplication(vx,
                                    Constant(frac_pi,
                                             precision=self.precision),
                                    precision=self.precision),
                     Multiplication(vx,
                                    Constant(frac_pi_lo,
                                             precision=self.precision),
                                    precision=self.precision),
                     precision=self.precision,
                     tag="vx_pi",
                     debug=debug_multi)

    k = NearestInteger(vx_pi,
                       precision=int_precision,
                       tag="k",
                       debug=debug_multi)
    # k in floating-point precision
    fk = Conversion(k, precision=self.precision, tag="fk", debug=debug_multi)

    inv_frac_pi_cst = Constant(inv_frac_pi,
                               tag="inv_frac_pi",
                               precision=self.precision,
                               debug=debug_multi)
    inv_frac_pi_lo_cst = Constant(inv_frac_pi_lo,
                                  tag="inv_frac_pi_lo",
                                  precision=self.precision,
                                  debug=debug_multi)

    # Cody-Waite reduction
    red_coeff1 = Multiplication(fk,
                                inv_frac_pi_cst,
                                precision=self.precision,
                                exact=True)
    red_coeff2 = Multiplication(Negation(fk, precision=self.precision),
                                inv_frac_pi_lo_cst,
                                precision=self.precision,
                                exact=True)

    # Should be exact / Sterbenz' Lemma
    pre_sub_mul = Subtraction(vx,
                              red_coeff1,
                              precision=self.precision,
                              exact=True)

    # Fast2Sum
    s = Addition(pre_sub_mul,
                 red_coeff2,
                 precision=self.precision,
                 unbreakable=True,
                 tag="s",
                 debug=debug_multi)
    z = Subtraction(s,
                    pre_sub_mul,
                    precision=self.precision,
                    unbreakable=True,
                    tag="z",
                    debug=debug_multi)
    t = Subtraction(red_coeff2,
                    z,
                    precision=self.precision,
                    unbreakable=True,
                    tag="t",
                    debug=debug_multi)

    red_vx_std = Addition(s, t, precision=self.precision)
    red_vx_std.set_attributes(tag="red_vx_std", debug=debug_multi)

    # To compute sine we offset x by 3pi/2
    # which means add 3 * S2^(frac_pi_index-1) to k
    if self.sin_output:
        Log.report(Log.Info, "Computing Sin")
        offset_k = Addition(k,
                            C_offset,
                            precision=int_precision,
                            tag="offset_k")
    else:
        Log.report(Log.Info, "Computing Cos")
        offset_k = k

    modk = Variable("modk", precision=int_precision, var_type=Variable.Local)
    red_vx = Variable("red_vx",
                      precision=self.precision,
                      var_type=Variable.Local)

    # Faster modulo using bitwise logic
    modk_std = BitLogicAnd(offset_k,
                           2**(frac_pi_index + 1) - 1,
                           precision=int_precision,
                           tag="modk",
                           debug=debug_multi)

    approx_interval = Interval(-pi / (S2**(frac_pi_index + 1)),
                               pi / S2**(frac_pi_index + 1))

    red_vx.set_interval(approx_interval)

    Log.report(Log.Info, "approx interval: %s\n" % approx_interval)

    Log.report(Log.Info, "building tabulated approximation for sin and cos")

    # NOTE(review): error_function is defined twice (identically); only the
    # later binding is used
    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    # polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_estrin_scheme
    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme

    table_index_size = frac_pi_index + 1
    cos_table = ML_NewTable(dimensions=[2**table_index_size, 1],
                            storage_precision=self.precision,
                            tag=self.uniquify_name("cos_table"))

    # fill table with cos(i * pi / 2^m) rounded to the working precision
    for i in range(2**(frac_pi_index + 1)):
        local_x = i * pi / S2**frac_pi_index
        cos_local = round(cos(local_x), self.precision.get_sollya_object(),
                          sollya.RN)
        cos_table[i][0] = cos_local

    # sin is read from the cos table with a quarter-period index offset
    sin_index = Modulo(modk + 2**(frac_pi_index - 1),
                       2**(frac_pi_index + 1),
                       precision=int_precision,
                       tag="sin_index")  #, debug = debug_multi)
    tabulated_cos = TableLoad(cos_table,
                              modk,
                              C0,
                              precision=self.precision,
                              tag="tab_cos",
                              debug=debug_multi)
    tabulated_sin = -TableLoad(cos_table,
                               sin_index,
                               C0,
                               precision=self.precision,
                               tag="tab_sin",
                               debug=debug_multi)

    poly_degree_cos = sup(
        guessdegree(cos(sollya.x), approx_interval, S2**
                    -self.precision.get_precision()) + 2)
    poly_degree_sin = sup(
        guessdegree(
            sin(sollya.x) / sollya.x, approx_interval, S2**
            -self.precision.get_precision()) + 2)

    poly_degree_cos_list = range(0, int(poly_degree_cos) + 3)
    poly_degree_sin_list = range(0, int(poly_degree_sin) + 3)

    # cosine polynomial: limiting first and second coefficient precision to 1-bit
    poly_cos_prec_list = [self.precision] * len(poly_degree_cos_list)
    # sine polynomial: limiting first coefficient precision to 1-bit
    poly_sin_prec_list = [self.precision] * len(poly_degree_sin_list)

    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    Log.report(Log.Info, "building mathematical polynomials for sin and cos")
    # Polynomial approximations
    Log.report(Log.Info, "cos")
    poly_object_cos, poly_error_cos = Polynomial.build_from_approximation_with_error(
        cos(sollya.x),
        poly_degree_cos_list,
        poly_cos_prec_list,
        approx_interval,
        sollya.absolute,
        error_function=error_function)
    Log.report(Log.Info, "sin")
    poly_object_sin, poly_error_sin = Polynomial.build_from_approximation_with_error(
        sin(sollya.x),
        poly_degree_sin_list,
        poly_sin_prec_list,
        approx_interval,
        sollya.absolute,
        error_function=error_function)

    Log.report(
        Log.Info, "poly error cos: {} / {:d}".format(
            poly_error_cos, int(sollya.log2(poly_error_cos))))
    Log.report(
        Log.Info, "poly error sin: {0} / {1:d}".format(
            poly_error_sin, int(sollya.log2(poly_error_sin))))
    Log.report(Log.Info, "poly cos : %s" % poly_object_cos)
    Log.report(Log.Info, "poly sin : %s" % poly_object_sin)

    # Polynomial evaluation scheme
    poly_cos = polynomial_scheme_builder(
        poly_object_cos.sub_poly(start_index=1),
        red_vx,
        unified_precision=self.precision)
    poly_sin = polynomial_scheme_builder(
        poly_object_sin.sub_poly(start_index=2),
        red_vx,
        unified_precision=self.precision)
    poly_cos.set_attributes(tag="poly_cos", debug=debug_multi)
    poly_sin.set_attributes(tag="poly_sin",
                            debug=debug_multi,
                            unbreakable=True)

    # TwoProductFMA: x holds the product, y the FMA-computed rounding error
    mul_cos_x = tabulated_cos * poly_cos
    mul_cos_y = FusedMultiplyAdd(tabulated_cos,
                                 poly_cos,
                                 -mul_cos_x,
                                 precision=self.precision)

    mul_sin_x = tabulated_sin * poly_sin
    mul_sin_y = FusedMultiplyAdd(tabulated_sin,
                                 poly_sin,
                                 -mul_sin_x,
                                 precision=self.precision)

    mul_coeff_sin_hi = tabulated_sin * red_vx
    mul_coeff_sin_lo = FusedMultiplyAdd(tabulated_sin, red_vx,
                                        -mul_coeff_sin_hi)

    mul_cos = Addition(mul_cos_x,
                       mul_cos_y,
                       precision=self.precision,
                       tag="mul_cos")  #, debug = debug_multi)
    mul_sin = Negation(Addition(mul_sin_x,
                                mul_sin_y,
                                precision=self.precision),
                       precision=self.precision,
                       tag="mul_sin")  #, debug = debug_multi)
    mul_coeff_sin = Negation(Addition(mul_coeff_sin_hi,
                                      mul_coeff_sin_lo,
                                      precision=self.precision),
                             precision=self.precision,
                             tag="mul_coeff_sin")  #, debug = debug_multi)

    mul_cos_x.set_attributes(
        tag="mul_cos_x", precision=self.precision)  #, debug = debug_multi)
    mul_cos_y.set_attributes(
        tag="mul_cos_y", precision=self.precision)  #, debug = debug_multi)
    mul_sin_x.set_attributes(
        tag="mul_sin_x", precision=self.precision)  #, debug = debug_multi)
    mul_sin_y.set_attributes(
        tag="mul_sin_y", precision=self.precision)  #, debug = debug_multi)

    cos_eval_d_1 = (((mul_cos + mul_sin) + mul_coeff_sin) + tabulated_cos)

    cos_eval_d_1.set_attributes(tag="cos_eval_d_1",
                                precision=self.precision,
                                debug=debug_multi)

    result_1 = Statement(Return(cos_eval_d_1))

    #######################################################################
    #                    LARGE ARGUMENT MANAGEMENT                        #
    #                 (lar: Large Argument Reduction)                     #
    #######################################################################
    # payne and hanek argument reduction for large arguments
    ph_k = frac_pi_index
    ph_frac_pi = round(S2**ph_k / pi, 1500, sollya.RN)
    ph_inv_frac_pi = pi / S2**ph_k

    ph_statement, ph_acc, ph_acc_int = generate_payne_hanek(vx,
                                                            ph_frac_pi,
                                                            self.precision,
                                                            n=100,
                                                            k=ph_k)

    # assigning Large Argument Reduction reduced variable
    lar_vx = Variable("lar_vx",
                      precision=self.precision,
                      var_type=Variable.Local)

    lar_red_vx = Addition(Multiplication(lar_vx,
                                         inv_frac_pi,
                                         precision=self.precision),
                          Multiplication(lar_vx,
                                         inv_frac_pi_lo,
                                         precision=self.precision),
                          precision=self.precision,
                          tag="lar_red_vx",
                          debug=debug_multi)

    C32 = Constant(2**(ph_k + 1), precision=int_precision, tag="C32")
    # fold a negative integer accumulator back into [0, 2**(ph_k+1))
    ph_acc_int_red = Select(ph_acc_int < C0,
                            C32 + ph_acc_int,
                            ph_acc_int,
                            precision=int_precision,
                            tag="ph_acc_int_red")
    if self.sin_output:
        lar_offset_k = Addition(ph_acc_int_red,
                                C_offset,
                                precision=int_precision,
                                tag="lar_offset_k")
    else:
        lar_offset_k = ph_acc_int_red

    ph_acc_int_red.set_attributes(tag="ph_acc_int_red", debug=debug_multi)
    lar_modk = BitLogicAnd(lar_offset_k,
                           2**(frac_pi_index + 1) - 1,
                           precision=int_precision,
                           tag="lar_modk",
                           debug=debug_multi)

    lar_statement = Statement(ph_statement,
                              ReferenceAssign(lar_vx,
                                              ph_acc,
                                              debug=debug_multi),
                              ReferenceAssign(red_vx,
                                              lar_red_vx,
                                              debug=debug_multi),
                              ReferenceAssign(modk, lar_modk),
                              prevent_optimization=True)

    test_NaN_or_Inf = Test(vx,
                           specifier=Test.IsInfOrNaN,
                           likely=False,
                           tag="NaN_or_Inf",
                           debug=debug_multi)
    return_NaN_or_Inf = Statement(Return(FP_QNaN(self.precision)))

    # top-level control flow: special values, then large vs standard
    # argument reduction
    scheme = ConditionBlock(
        test_NaN_or_Inf, Statement(ClearException(), return_NaN_or_Inf),
        Statement(
            modk, red_vx,
            ConditionBlock(
                test_ph_bound, lar_statement,
                Statement(
                    ReferenceAssign(modk, modk_std),
                    ReferenceAssign(red_vx, red_vx_std),
                )), result_1))

    return scheme
def piecewise_approximation(function,
                            variable,
                            precision,
                            bound_low=-1.0,
                            bound_high=1.0,
                            num_intervals=16,
                            max_degree=2,
                            error_threshold=S2**-24,
                            odd=False,
                            even=False):
    """ Generate a piecewise approximation

        :param function: function to be approximated
        :type function: SollyaObject
        :param variable: input variable
        :type variable: Variable
        :param precision: variable's format
        :type precision: ML_Format
        :param bound_low: lower bound for the approximation interval
        :param bound_high: upper bound for the approximation interval
        :param num_intervals: number of sub-interval / sub-division of the main interval
        :param max_degree: maximum degree for an approximation on any sub-interval
        :param error_threshold: error bound for an approximation on any sub-interval
        :param odd: restrict polynomial monomials to odd degrees
        :param even: restrict polynomial monomials to even degrees

        :return: pair (scheme, error) where scheme is a graph node for an
            approximation scheme of function evaluated at variable, and error
            is the maximum approximation error encountered
        :rtype tuple(ML_Operation, SollyaObject): """
    degree_generator = piecewise_approximation_degree_generator(
        function,
        bound_low,
        bound_high,
        num_intervals=num_intervals,
        error_threshold=error_threshold,
    )
    degree_list = list(degree_generator)

    # if max_degree is None then we determine it locally
    if max_degree is None:
        max_degree = max(degree_list)
    # table to store coefficients of the approximation on each segment
    coeff_table = ML_NewTable(
        dimensions=[num_intervals, max_degree + 1],
        storage_precision=precision,
        tag="coeff_table",
        const=True  # by default all approximation coeff table are const
    )

    error_function = lambda p, f, ai, mod, t: sollya.dirtyinfnorm(p - f, ai)
    max_approx_error = 0.0
    interval_size = (bound_high - bound_low) / num_intervals

    for i in range(num_intervals):
        subint_low = bound_low + i * interval_size
        subint_high = bound_low + (i + 1) * interval_size

        # approximation is built around the sub-interval's lower bound, on a
        # zero-centered local interval
        local_function = function(sollya.x + subint_low)
        local_interval = Interval(-interval_size, interval_size)

        local_degree = degree_list[i]
        if local_degree > max_degree:
            Log.report(
                Log.Warning,
                "local_degree {} exceeds max_degree bound ({}) in piecewise_approximation",
                local_degree, max_degree)
        # as max_degree defines the size of the table we can use
        # it as the degree for each sub-interval polynomial
        # as there is nothing to gain (yet) by using a smaller polynomial
        degree = max_degree  # min(max_degree, local_degree)

        if function(subint_low) == 0.0:
            # if the lower bound is a zero to the function, we
            # need to force value=0 for the constant coefficient
            # and extend the approximation interval
            local_poly_degree_list = list(
                range(1 if even else 0, degree + 1, 2 if odd or even else 1))
            poly_object, approx_error = Polynomial.build_from_approximation_with_error(
                function(sollya.x) / sollya.x,
                local_poly_degree_list,
                [precision] * len(local_poly_degree_list),
                Interval(-subint_high * 0.95, subint_high),
                sollya.absolute,
                error_function=error_function)
            # multiply by sollya.x
            poly_object = poly_object.sub_poly(offset=-1)
        else:
            try:
                poly_object, approx_error = Polynomial.build_from_approximation_with_error(
                    local_function,
                    degree, [precision] * (degree + 1),
                    local_interval,
                    sollya.absolute,
                    error_function=error_function)
            except SollyaError as err:
                # try to see if function is constant on the interval (possible
                # failure cause for fpminmax)
                cst_value = precision.round_sollya_object(
                    function(subint_low), sollya.RN)
                accuracy = error_threshold
                diff_with_cst_range = sollya.supnorm(cst_value,
                                                     local_function,
                                                     local_interval,
                                                     sollya.absolute,
                                                     accuracy)
                diff_with_cst = sup(abs(diff_with_cst_range))
                if diff_with_cst < error_threshold:
                    Log.report(Log.Info, "constant polynomial detected")
                    poly_object = Polynomial([function(subint_low)] +
                                             [0] * degree)
                    approx_error = diff_with_cst
                else:
                    Log.report(
                        Log.error,
                        "degree: {} for index {}, diff_with_cst={} (vs error_threshold={}) ",
                        degree,
                        i,
                        diff_with_cst,
                        error_threshold,
                        error=err)

        # store the segment's coefficients, zero-filling degrees absent
        # from the polynomial's coefficient map
        for ci in range(max_degree + 1):
            if ci in poly_object.coeff_map:
                coeff_table[i][ci] = poly_object.coeff_map[ci]
            else:
                coeff_table[i][ci] = 0.0

        if approx_error > error_threshold:
            Log.report(
                Log.Warning,
                "piecewise_approximation on index {} exceeds error threshold: {} > {}",
                i, approx_error, error_threshold)
        max_approx_error = max(max_approx_error, abs(approx_error))
    # computing offset
    diff = Subtraction(variable,
                       Constant(bound_low, precision=precision),
                       tag="diff",
                       debug=debug_multi,
                       precision=precision)
    int_prec = precision.get_integer_format()

    # delta = bound_high - bound_low
    delta_ratio = Constant(num_intervals / (bound_high - bound_low),
                           precision=precision)
    # computing table index
    # index = nearestint(diff / delta * <num_intervals>)
    # clamped into [0, num_intervals - 1]
    index = Max(0,
                Min(
                    NearestInteger(
                        Multiplication(diff, delta_ratio,
                                       precision=precision),
                        precision=int_prec,
                    ), num_intervals - 1),
                tag="index",
                debug=debug_multi,
                precision=int_prec)
    poly_var = Subtraction(diff,
                           Multiplication(
                               Conversion(index, precision=precision),
                               Constant(interval_size,
                                        precision=precision)),
                           precision=precision,
                           tag="poly_var",
                           debug=debug_multi)
    # generating indexed polynomial (coefficients highest-degree first for
    # Horner evaluation)
    coeffs = [(ci, TableLoad(coeff_table, index, ci))
              for ci in range(max_degree + 1)][::-1]
    poly_scheme = PolynomialSchemeEvaluator.generate_horner_scheme2(
        coeffs, poly_var, precision, {}, precision)
    return poly_scheme, max_approx_error
def generate_scheme(self): """Produce an abstract scheme for the logarithm. This abstract scheme will be used by the code generation backend. """ if self.precision not in [ML_Binary32, ML_Binary64]: Log.report(Log.Error, "The demanded precision is not supported") vx = self.implementation.add_input_variable("x", self.precision) def default_bool_convert(optree, precision=None, **kw): return bool_convert(optree, precision, -1, 0, **kw) \ if isinstance(self.processor, VectorBackend) \ else bool_convert(optree, precision, 1, 0, **kw) precision = self.precision.sollya_object int_prec = self.precision.get_integer_format() Log.report(Log.Info, "int_prec is %s" % int_prec) uint_prec = self.precision.get_unsigned_integer_format() Log.report(Log.Info, "MDL constants") cgpe_scheme_idx = int(self.cgpe_index) table_index_size = int(self.tbl_index_size) # table_nb_elements = 2**(table_index_size) table_dimensions = [2*table_nb_elements] # two values are stored for each element field_size = Constant(self.precision.get_field_size(), precision = int_prec, tag = 'field_size') if self.log_radix == EXP_1: log2_hi = Constant( round(log(2), precision, sollya.RN), precision = self.precision, tag = 'log2_hi') log2_lo = Constant( round(log(2) - round(log(2), precision, sollya.RN), precision, sollya.RN), precision = self.precision, tag = 'log2_lo') elif self.log_radix == 10: log2_hi = Constant( round(log10(2), precision, sollya.RN), precision = self.precision, tag = 'log2_hi') log2_lo = Constant( round(log10(2) - round(log10(2), precision, sollya.RN), precision, sollya.RN), precision = self.precision, tag = 'log2_lo') # ... if log_radix == '2' then log2(2) == 1 # subnormal_mask aims at trapping positive subnormals except zero. # That's why we will subtract 1 to the integer bitstring of the input, and # then compare for Less (strict) the resulting integer bitstring to this # mask, e.g. 0x7fffff for binary32. 
if self.no_subnormal == False: subnormal_mask = Constant((1 << self.precision.get_field_size()) - 1, precision = int_prec, tag = 'subnormal_mask') fp_one = Constant(1.0, precision = self.precision, tag = 'fp_one') fp_one_as_uint = TypeCast(fp_one, precision = uint_prec, tag = 'fp_one_as_uint') int_zero = Constant(0, precision = int_prec, tag = 'int_zero') int_one = Constant(1, precision = int_prec, tag = 'int_one') table_mantissa_half_ulp = Constant( 1 << (self.precision.field_size - table_index_size - 1), precision = int_prec ) table_s_exp_index_mask = Constant( ~((table_mantissa_half_ulp.get_value() << 1) - 1), precision = uint_prec ) Log.report(Log.Info, "MDL table") # The table holds approximations of -log(2^tau * r_i) so we first compute # the index value for which tau changes from 1 to 0. cut = sqrt(2.) tau_index_limit = floor(table_nb_elements * (2./cut - 1)) sollya_logtbl = [ (-log1p(float(i) / table_nb_elements) + (0 if i <= tau_index_limit else log(2.))) / log(self.log_radix) for i in range(table_nb_elements) ] # ... init_logtbl_hi = [ round(sollya_logtbl[i], self.precision.get_mantissa_size(), sollya.RN) for i in range(table_nb_elements) ] init_logtbl_lo = [ round(sollya_logtbl[i] - init_logtbl_hi[i], self.precision.get_mantissa_size(), sollya.RN) for i in range(table_nb_elements) ] init_logtbl = [tmp[i] for i in range(len(init_logtbl_hi)) for tmp in [init_logtbl_hi, init_logtbl_lo]] log1p_table = ML_NewTable(dimensions = table_dimensions, storage_precision = self.precision, init_data = init_logtbl, tag = 'ml_log1p_table') # ... 
if self.no_rcp: sollya_rcptbl = [ (1/((1+float(i)/table_nb_elements)+2**(-1-int(self.tbl_index_size)))) for i in range(table_nb_elements) ] init_rcptbl = [ round(sollya_rcptbl[i], int(self.tbl_index_size)+1, # self.precision.get_mantissa_size(), sollya.RN) for i in range(table_nb_elements) ] rcp_table = ML_NewTable(dimensions = [table_nb_elements], storage_precision = self.precision, init_data = init_rcptbl, tag = 'ml_rcp_table') # ... Log.report(Log.Info, 'MDL unified subnormal handling') vx_as_int = TypeCast(vx, precision = int_prec, tag = 'vx_as_int') if self.no_subnormal == False: vx_as_uint = TypeCast(vx, precision = uint_prec, tag = 'vx_as_uint') # Avoid the 0.0 case by subtracting 1 from vx_as_int tmp = Comparison(vx_as_int - 1, subnormal_mask, specifier = Comparison.Less) is_subnormal = default_bool_convert( tmp, # Will catch negative values as well as NaNs with sign bit set precision = int_prec) is_subnormal.set_attributes(tag = "is_subnormal") if not(isinstance(self.processor, VectorBackend)): is_subnormal = Subtraction(Constant(0, precision = int_prec), is_subnormal, precision = int_prec) ################################################# # Vectorizable integer based subnormal handling # ################################################# # 1. lzcnt # custom lzcount-like for subnormal numbers using FPU (see draft article) Zi = BitLogicOr(vx_as_uint, fp_one_as_uint, precision = uint_prec, tag="Zi") Zf = Subtraction( TypeCast(Zi, precision = self.precision), fp_one, precision = self.precision, tag="Zf") # Zf exponent is -(nlz(x) - exponent_size). # 2. compute shift value # Vectorial comparison on x86+sse/avx is going to look like # '|0x00|0xff|0x00|0x00|' and that's why we use Negate. # But for scalar code generation, comparison will rather be either 0 or 1 # in C. Thus mask below won't be correct for a scalar implementation. # FIXME: Can we know the backend that will be called and choose in # consequence? Should we make something arch-agnostic instead? 
# n_value = BitLogicAnd( Addition( DirtyExponentExtraction(Zf, self.precision), Constant( self.precision.get_bias(), precision = int_prec), precision = int_prec), is_subnormal, precision = int_prec, tag = "n_value") alpha = Negation(n_value, tag="alpha") # # 3. shift left # renormalized_mantissa = BitLogicLeftShift(vx_as_int, value) normal_vx_as_int = BitLogicLeftShift(vx_as_int, alpha) # 4. set exponent to the right value # Compute the exponent to add : (p-1)-(value) + 1 = p-1-value # The final "+ 1" comes from the fact that once renormalized, the # floating-point datum has a biased exponent of 1 #tmp0 = Subtraction( # field_size, # value, # precision = int_prec, # tag="tmp0") # Set the value to 0 if the number is not subnormal #tmp1 = BitLogicAnd(tmp0, is_subnormal) #renormalized_exponent = BitLogicLeftShift( # tmp1, # field_size # ) else: # no_subnormal == True normal_vx_as_int = vx_as_int #normal_vx_as_int = renormalized_mantissa + renormalized_exponent normal_vx = TypeCast(normal_vx_as_int, precision = self.precision, tag = 'normal_vx') # alpha = BitLogicAnd(field_size, is_subnormal, tag = 'alpha') # XXX Extract the mantissa, see if this is supported in the x86 vector # backend or if it still uses the support_lib. 
vx_mantissa = MantissaExtraction(normal_vx, precision = self.precision) Log.report(Log.Info, "MDL scheme") if self.force_division == True: rcp_m = Division(fp_one, vx_mantissa, precision = self.precision) elif self.no_rcp == False: rcp_m = ReciprocalSeed(vx_mantissa, precision = self.precision) if not self.processor.is_supported_operation(rcp_m): if self.precision == ML_Binary64: # Try using a binary32 FastReciprocal binary32_m = Conversion(vx_mantissa, precision = ML_Binary32) rcp_m = ReciprocalSeed(binary32_m, precision = ML_Binary32) rcp_m = Conversion(rcp_m, precision = ML_Binary64) if not self.processor.is_supported_operation(rcp_m): # FIXME An approximation table could be used instead but for vector # implementations another GATHER would be required. # However this may well be better than a division... rcp_m = Division(fp_one, vx_mantissa, precision = self.precision) else: # ... use a look-up table rcp_shift = BitLogicLeftShift(normal_vx_as_int, self.precision.get_exponent_size() + 1) rcp_idx = BitLogicRightShift(rcp_shift, self.precision.get_exponent_size() + 1 + self.precision.get_field_size() - int(self.tbl_index_size)) rcp_m = TableLoad(rcp_table, rcp_idx, tag = 'rcp_idx', debug = debug_multi) # rcp_m.set_attributes(tag = 'rcp_m') # exponent is normally either 0 or -1, since m is in [1, 2). Possible # optimization? # exponent = ExponentExtraction(rcp_m, precision = self.precision, # tag = 'exponent') ri_round = TypeCast( Addition( TypeCast(rcp_m, precision = int_prec), table_mantissa_half_ulp, precision = int_prec ), precision = uint_prec ) ri_fast_rndn = BitLogicAnd( ri_round, table_s_exp_index_mask, tag = 'ri_fast_rndn', precision = uint_prec ) # u = m * ri - 1 ul = None if self.no_rcp == True: # ... 
u does not fit on a single word tmp_u, tmp_ul = Mul211(vx_mantissa, TypeCast(ri_fast_rndn, precision = self.precision), fma = (self.no_fma == False)) fp_minus_one = Constant(-1.0, precision = self.precision, tag = 'fp_minus_one') u, ul = Add212(fp_minus_one, tmp_u, tmp_ul) u.set_attributes(tag='uh') ul.set_attributes(tag='ul') elif self.no_fma == False: u = FusedMultiplyAdd( vx_mantissa, TypeCast(ri_fast_rndn, precision = self.precision), fp_one, specifier = FusedMultiplyAdd.Subtract, tag = 'u') else: # disable FMA # tmph + tmpl = m * ri, where tmph ~ 1 tmph, tmpl = Mul211(vx_mantissa, TypeCast(ri_fast_rndn, precision = self.precision), fma = False) # u_tmp = tmph - 1 ... exact due to Sterbenz u_tmp = Subtraction(tmph, fp_one, precision = self.precision) # u = u_tmp - tmpl ... exact since the result u is representable as a single word u = Addition(u_tmp, tmpl, precision = self.precision, tag = 'u') unneeded_bits = Constant( self.precision.field_size - table_index_size, precision=uint_prec, tag="unneeded_bits" ) assert self.precision.field_size - table_index_size >= 0 ri_bits = BitLogicRightShift( ri_fast_rndn, unneeded_bits, precision = uint_prec, tag = "ri_bits" ) # Retrieve mantissa's MSBs + first bit of exponent, for tau computation in case # exponent is 0 (i.e. biased 127, i.e. first bit of exponent is set.). # In this particular case, i = 0 but tau is 1 # table_index does not need to be as long as uint_prec might be, # try and keep it the size of size_t. size_t_prec = ML_UInt32 signed_size_t_prec = ML_Int32 table_index_mask = Constant( (1 << (table_index_size + 1)) - 1, precision = size_t_prec ) table_index = BitLogicAnd( Conversion(ri_bits, precision = size_t_prec), table_index_mask, tag = 'table_index', precision = size_t_prec ) # Compute tau using the tau_index_limit value. 
tmp = default_bool_convert( Comparison( TypeCast(table_index, precision = signed_size_t_prec), Constant(tau_index_limit, precision = signed_size_t_prec), specifier = Comparison.Greater if isinstance(self.processor, VectorBackend) else Comparison.LessOrEqual ), precision = signed_size_t_prec, tag="tmp" ) # A true tmp will typically be -1 for VectorBackends, but 1 for standard C. tau = Conversion( Addition(tmp, Constant(1, precision=signed_size_t_prec), precision = signed_size_t_prec, tag="pre_add") if isinstance(self.processor, VectorBackend) else tmp, precision=int_prec, tag="pre_tau" ) tau.set_attributes(tag = 'tau') # Update table_index: keep only table_index_size bits table_index_hi = BitLogicAnd( table_index, Constant((1 << table_index_size) - 1, precision = size_t_prec), precision = size_t_prec ) # table_index_hi = table_index_hi << 1 table_index_hi = BitLogicLeftShift( table_index_hi, Constant(1, precision = size_t_prec), precision = size_t_prec, tag = "table_index_hi" ) # table_index_lo = table_index_hi + 1 table_index_lo = Addition( table_index_hi, Constant(1, precision = size_t_prec), precision = size_t_prec, tag = "table_index_lo" ) tbl_hi = TableLoad(log1p_table, table_index_hi, tag = 'tbl_hi', debug = debug_multi) tbl_lo = TableLoad(log1p_table, table_index_lo, tag = 'tbl_lo', debug = debug_multi) # Compute exponent e + tau - alpha, but first subtract the bias. 
if self.no_subnormal == False: tmp_eptau = Addition( Addition( BitLogicRightShift( normal_vx_as_int, field_size, tag = 'exponent', interval = self.precision.get_exponent_interval(), precision = int_prec), Constant( self.precision.get_bias(), precision = int_prec)), tau, tag = 'tmp_eptau', precision = int_prec) exponent = Subtraction(tmp_eptau, alpha, precision = int_prec) else: exponent = Addition( Addition( BitLogicRightShift( normal_vx_as_int, field_size, tag = 'exponent', interval = self.precision.get_exponent_interval(), precision = int_prec), Constant( self.precision.get_bias(), precision = int_prec)), tau, tag = 'tmp_eptau', precision = int_prec) # fp_exponent = Conversion(exponent, precision = self.precision, tag = 'fp_exponent') Log.report(Log.Info, 'MDL polynomial approximation') if self.log_radix == EXP_1: sollya_function = log(1 + sollya.x) elif self.log_radix == 2: sollya_function = log2(1 + sollya.x) elif self.log_radix == 10: sollya_function = log10(1 + sollya.x) # ... if self.force_division == True: # rcp accuracy is 2^(-p) boundrcp = 2**(-self.precision.get_precision()) else: boundrcp = 1.5 * 2**(-12) # ... see Intel intrinsics guide if self.precision in [ML_Binary64]: if not self.processor.is_supported_operation(rcp_m): boundrcp = (1+boundrcp)*(1+2**(-24)) - 1 else: boundrcp = 2**(-14) # ... 
see Intel intrinsics guide arg_red_mag = boundrcp + 2**(-table_index_size-1) + boundrcp * 2**(-table_index_size-1) if self.no_rcp == False: approx_interval = Interval(-arg_red_mag, arg_red_mag) else: approx_interval = Interval(-2**(-int(self.tbl_index_size)+1),2**(-int(self.tbl_index_size)+1)) max_eps = 2**-(2*(self.precision.get_field_size())) Log.report(Log.Info, "max acceptable error for polynomial = {}".format(float.hex(max_eps))) poly_degree = sup( guessdegree( sollya_function, approx_interval, max_eps, ) ) Log.report(Log.Info, "poly degree is ", poly_degree) if self.log_radix == EXP_1: poly_object = Polynomial.build_from_approximation( sollya_function, range(2, int(poly_degree) + 1), # Force 1st 2 coeffs to 0 and 1, resp. # Emulate double-self.precision coefficient formats [self.precision.get_mantissa_size()*2 + 1]*(poly_degree - 1), approx_interval, sollya.absolute, 0 + sollya._x_) # Force the first 2 coefficients to 0 and 1, resp. else: # ... == '2' or '10' poly_object = Polynomial.build_from_approximation( sollya_function, range(1, int(poly_degree) + 1), # Force 1st coeff to 0 # Emulate double-self.precision coefficient formats [self.precision.get_mantissa_size()*2 + 1]*(poly_degree), approx_interval, sollya.absolute, 0) # Force the first coefficients to 0 Log.report(Log.Info, str(poly_object)) constant_precision = ML_SingleSingle if self.precision == ML_Binary32 \ else ML_DoubleDouble if self.precision == ML_Binary64 \ else None if is_cgpe_available(): log1pu_poly = PolynomialSchemeEvaluator.generate_cgpe_scheme( poly_object, u, unified_precision = self.precision, constant_precision = constant_precision, scheme_id = cgpe_scheme_idx ) else: Log.report(Log.Warning, "CGPE not available, falling back to std poly evaluator") log1pu_poly = PolynomialSchemeEvaluator.generate_horner_scheme( poly_object, u, unified_precision = self.precision, constant_precision = constant_precision ) # XXX Dirty implementation of double-(self.precision) poly def 
dirty_poly_node_conversion(node, variable_h, variable_l, use_fma): return dirty_multi_node_expand( node, self.precision, mem_map={variable_h: (variable_h, variable_l)}, fma=use_fma) log1pu_poly_hi, log1pu_poly_lo = dirty_poly_node_conversion(log1pu_poly, u, ul, use_fma=(self.no_fma == False)) log1pu_poly_hi.set_attributes(tag = 'log1pu_poly_hi') log1pu_poly_lo.set_attributes(tag = 'log1pu_poly_lo') # Compute log(2) * (e + tau - alpha) if self.log_radix != 2: # 'e' or '10' log2e_hi, log2e_lo = Mul212(fp_exponent, log2_hi, log2_lo, fma = (self.no_fma == False)) # Add log1p(u) if self.log_radix != 2: # 'e' or '10' tmp_res_hi, tmp_res_lo = Add222(log2e_hi, log2e_lo, log1pu_poly_hi, log1pu_poly_lo) else: tmp_res_hi, tmp_res_lo = Add212(fp_exponent, log1pu_poly_hi, log1pu_poly_lo) # Add -log(2^(tau)/m) approximation retrieved by two table lookups logx_hi = Add122(tmp_res_hi, tmp_res_lo, tbl_hi, tbl_lo)[0] logx_hi.set_attributes(tag = 'logx_hi') scheme = Return(logx_hi, precision = self.precision) return scheme
def generate_scheme(self):
    """Build the metalibm operation graph (scheme) for log10(x).

    Outline: split x = 2^e * m, reduce m against a 7-bit-indexed reciprocal
    seed r ~ 1/m (so u = r*m - 1 is small), evaluate a minimax polynomial for
    log10(1+u)/u on u, and reconstruct
        log10(x) ~ e*log10(2) - log10(r) + u*poly(u)
    using hi/lo splits of log10(2) and of the tabulated -log10(r) values to
    limit rounding error.  Special inputs (negative, NaN/inf, subnormal,
    zero, exact 1.0, biased-exponent == -1) are dispatched through nested
    ConditionBlocks at the end.
    """
    #func_implementation = CodeFunction(self.function_name, output_format = self.precision)
    vx = self.implementation.add_input_variable("x", self.get_input_precision())
    sollya_precision = self.get_input_precision().get_sollya_object()

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    test_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                           debug=True, tag="nan_or_inf")
    test_nan = Test(vx, specifier=Test.IsNaN, debug=True, tag="is_nan_test")
    test_positive = Comparison(vx, 0, specifier=Comparison.GreaterOrEqual,
                               debug=True, tag="inf_sign")
    test_signaling_nan = Test(vx, specifier=Test.IsSignalingNaN, debug=True,
                              tag="is_signaling_nan")
    return_snan = Statement(
        ExpRaiseReturn(ML_FPE_Invalid, return_value=FP_QNaN(self.precision)))

    # Split log10(2) into a truncated hi part (short enough that
    # exponent * log2_hi stays exact) and a lo compensation part.
    log2_hi_value = round(
        log10(2),
        self.precision.get_field_size() -
        (self.precision.get_exponent_size() + 1), RN)
    log2_lo_value = round(
        log10(2) - log2_hi_value, self.precision.sollya_object, RN)
    log2_hi = Constant(log2_hi_value, precision=self.precision)
    log2_lo = Constant(log2_lo_value, precision=self.precision)

    vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debugd)

    int_precision = self.precision.get_integer_format()

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision=self.precision)
    dummy_div_seed = DivisionSeed(dummy_var, precision=self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_div_seed, language=None,
        table_getter=lambda self: self.approx_table_map)

    # table creation:
    # log_table[i] = (hi, lo) double-word split of log10(inv_approx_table[i])
    table_index_size = 7
    table_index_range = range(1, 2**table_index_size)
    log_table = ML_NewTable(dimensions=[2**table_index_size, 2],
                            storage_precision=self.precision)
    # index 0 maps to the "no reduction" path (arg_red_index forced to 1.0)
    log_table[0][0] = 0.0
    log_table[0][1] = 0.0
    for i in table_index_range:
        #inv_value = (1.0 + (self.processor.inv_approx_table[i] / S2**9) + S2**-52) * S2**-1
        #inv_value = (1.0 + (inv_approx_table[i][0] / S2**9) ) * S2**-1
        inv_value = inv_approx_table[i][0]
        value_high = round(
            log10(inv_value),
            self.precision.get_field_size() -
            (self.precision.get_exponent_size() + 1), sollya.RN)
        value_low = round(
            log10(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low

    # determining log_table range (used later for the gappa eval-error map)
    high_index_function = lambda table, i: table[i][0]
    low_index_function = lambda table, i: table[i][1]
    table_high_interval = log_table.get_subset_interval(
        high_index_function, table_index_range)
    table_low_interval = log_table.get_subset_interval(
        low_index_function, table_index_range)

    def compute_log(_vx, exp_corr_factor=None):
        # Core log10 computation for a normal (non-subnormal) operand _vx.
        # exp_corr_factor, when given, compensates the exponent after the
        # caller pre-scaled the input (the subnormal path below calls this
        # with vx * 2^100 and exp_corr_factor = -100).
        _vx_mant = MantissaExtraction(_vx, tag="_vx_mant", debug=debug_lftolx)
        _vx_exp = ExponentExtraction(_vx, tag="_vx_exp", debug=debugd)

        # top 7 mantissa bits select the reciprocal/log table entry
        table_index = BitLogicAnd(BitLogicRightShift(
            TypeCast(_vx_mant, precision=int_precision, debug=debuglx),
            self.precision.get_field_size() - 7, debug=debuglx),
            0x7f, tag="table_index", debug=debuglld)

        # argument reduction
        # TODO: detect if single operand inverse seed is supported by the targeted architecture
        # The seed's last mantissa bit is cleared (AND with -2 as uint64) so
        # the reduction index matches the table built from the same seeds.
        pre_arg_red_index = TypeCast(BitLogicAnd(
            TypeCast(DivisionSeed(_vx_mant, precision=self.precision,
                                  tag="seed", debug=debug_lftolx, silent=True),
                     precision=ML_UInt64),
            Constant(-2, precision=ML_UInt64),
            precision=ML_UInt64),
            precision=self.precision, tag="pre_arg_red_index",
            debug=debug_lftolx)
        # table_index == 0 means mantissa ~ 1.0: skip reduction entirely
        arg_red_index = Select(Equal(table_index, 0), 1.0, pre_arg_red_index,
                               tag="arg_red_index", debug=debug_lftolx)
        #if not processor.is_supported_operation(arg_red_index):
        #    if self.precision != ML_Binary32:
        #        arg_red_index = DivisionSeed(Conversion(_vx_mant, precision = ML_Binary32), precision = ML_Binary32,
        _red_vx = arg_red_index * _vx_mant - 1.0
        # 2^-7 bound on |r*m - 1| given the 7-bit table / seed accuracy
        inv_err = S2**-7
        red_interval = Interval(1 - inv_err, 1 + inv_err)
        _red_vx.set_attributes(tag="_red_vx", debug=debug_lftolx,
                               interval=red_interval)

        # return in case of standard (non-special) input
        _log_inv_lo = TableLoad(log_table, table_index, 1, tag="log_inv_lo",
                                debug=debug_lftolx)
        _log_inv_hi = TableLoad(log_table, table_index, 0, tag="log_inv_hi",
                                debug=debug_lftolx)

        print("building mathematical polynomial")
        approx_interval = Interval(-inv_err, inv_err)
        poly_degree = sup(
            guessdegree(
                log10(1 + sollya.x) / sollya.x, approx_interval, S2**
                -(self.precision.get_field_size() + 1))) + 1
        # NOTE(review): bare `x` below where the guessdegree call above uses
        # `sollya.x` -- presumably `x` is imported from sollya at file level;
        # confirm, otherwise this raises NameError at generation time.
        global_poly_object = Polynomial.build_from_approximation(
            log10(1 + x) / x, poly_degree,
            [self.precision] * (poly_degree + 1), approx_interval,
            sollya.absolute)
        poly_object = global_poly_object  #.sub_poly(start_index = 1)

        print("generating polynomial evaluation scheme")
        _poly = PolynomialSchemeEvaluator.generate_horner_scheme(
            poly_object, _red_vx, unified_precision=self.precision)
        _poly.set_attributes(tag="poly", debug=debug_lftolx)
        print(global_poly_object.get_sollya_object())

        # exponent, corrected when the caller pre-scaled the input
        corr_exp = Conversion(
            _vx_exp if exp_corr_factor == None else _vx_exp + exp_corr_factor,
            precision=self.precision)

        split_red_vx = Split(_red_vx, precision=ML_DoubleDouble,
                             tag="split_red_vx", debug=debug_ddtolx)
        red_vx_hi = split_red_vx.hi
        red_vx_lo = split_red_vx.lo

        # result = _red_vx * poly - log_inv_hi - log_inv_lo + _vx_exp * log2_hi + _vx_exp * log2_lo
        pre_result = -_log_inv_hi + ((_red_vx * _poly + (corr_exp * log2_lo - _log_inv_lo)))
        pre_result.set_attributes(tag="pre_result", debug=debug_lftolx)
        exact_log2_hi_exp = corr_exp * log2_hi
        exact_log2_hi_exp.set_attributes(tag="exact_log2_hi_hex",
                                         debug=debug_lftolx)
        # alternate accurate path (_result_one): sum re-associated so the
        # cancelling terms (exponent part vs table hi part) combine first
        cancel_part = (corr_exp * log2_hi - _log_inv_hi)
        cancel_part.set_attributes(tag="cancel_part", debug=debug_lftolx)
        sub_part = red_vx_hi + cancel_part
        sub_part.set_attributes(tag="sub_part", debug=debug_lftolx)
        #result_one_low_part = (red_vx_hi * _poly + (red_vx_lo + (red_vx_lo * _poly + (corr_exp * log2_lo - _log_inv_lo))))
        result_one_low_part = ((red_vx_lo + (red_vx_lo * _poly + (corr_exp * log2_lo - _log_inv_lo))))
        result_one_low_part.set_attributes(tag="result_one_low_part",
                                           debug=debug_lftolx)

        _result_one = ((sub_part) + red_vx_hi * _poly) + result_one_low_part
        _result = exact_log2_hi_exp + pre_result
        return _result, _poly, _log_inv_lo, _log_inv_hi, _red_vx, _result_one, corr_exp

    result, poly, log_inv_lo, log_inv_hi, red_vx, new_result_one, corr_exp = compute_log(
        vx)
    result.set_attributes(tag="result", debug=debug_lftolx)
    new_result_one.set_attributes(tag="new_result_one", debug=debug_lftolx)

    # building eval error map: symbolic stand-ins with known intervals so
    # gappa can bound the evaluation error of `result`
    eval_error_map = {
        red_vx:
        Variable("red_vx", precision=self.precision,
                 interval=red_vx.get_interval()),
        log_inv_hi:
        Variable("log_inv_hi", precision=self.precision,
                 interval=table_high_interval),
        log_inv_lo:
        Variable("log_inv_lo", precision=self.precision,
                 interval=table_low_interval),
        corr_exp:
        Variable("corr_exp_g", precision=self.precision,
                 interval=self.precision.get_exponent_interval()),
    }
    # computing gappa error (best-effort: skipped when gappa is absent)
    if is_gappa_installed():
        poly_eval_error = self.get_eval_error(result, eval_error_map)
        print("poly_eval_error: ", poly_eval_error)

    neg_input = Comparison(vx, 0, likely=False, specifier=Comparison.Less,
                           debug=debugd, tag="neg_input")
    vx_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                         debug=debugd, tag="nan_or_inf")
    vx_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False,
                   debug=debugd, tag="snan")
    vx_inf = Test(vx, specifier=Test.IsInfty, likely=False, debug=debugd,
                  tag="inf")
    vx_subnormal = Test(vx, specifier=Test.IsSubnormal, likely=False,
                        debug=debugd, tag="vx_subnormal")
    vx_zero = Test(vx, specifier=Test.IsZero, likely=False, debug=debugd,
                   tag="vx_zero")

    exp_mone = Equal(vx_exp, -1, tag="exp_minus_one", debug=debugd,
                     likely=False)
    vx_one = Equal(vx, 1.0, tag="vx_one", likely=False, debug=debugd)

    # exp=-1 case: computed as log10(2*m) - log10(2) to dodge the
    # cancellation when e = -1 and m ~ 2
    print("managing exp=-1 case")
    #red_vx_2 = arg_red_index * vx_mant * 0.5
    #approx_interval2 = Interval(0.5 - inv_err, 0.5 + inv_err)
    #poly_degree2 = sup(guessdegree(log(x), approx_interval2, S2**-(self.precision.get_field_size()+1))) + 1
    #poly_object2 = Polynomial.build_from_approximation(log(sollya.x), poly_degree, [self.precision]*(poly_degree+1), approx_interval2, sollya.absolute)
    #print "poly_object2: ", poly_object2.get_sollya_object()
    #poly2 = PolynomialSchemeEvaluator.generate_horner_scheme(poly_object2, red_vx_2, unified_precision = self.precision)
    #poly2.set_attributes(tag = "poly2", debug = debug_lftolx)
    #result2 = (poly2 - log_inv_hi - log_inv_lo)
    log_subtract = -log_inv_hi - log2_hi
    log_subtract.set_attributes(tag="log_subtract", debug=debug_lftolx)
    result2 = (log_subtract) + ((poly * red_vx) - (log_inv_lo + log2_lo))
    result2.set_attributes(tag="result2", debug=debug_lftolx)

    # subnormal inputs: rescale by 2^100 and correct the exponent by -100
    m100 = -100
    S2100 = Constant(S2**100, precision=self.precision)
    result_subnormal, _, _, _, _, _, _ = compute_log(vx * S2100,
                                                     exp_corr_factor=m100)

    # near-1.0 path: direct polynomial in (vx - 1), constant term dropped.
    # NOTE(review): result_one/cond_one are built but the branch using them
    # is commented out in the final scheme below.
    print("managing close to 1.0 cases")
    one_err = S2**-7
    approx_interval_one = Interval(-one_err, one_err)
    red_vx_one = vx - 1.0
    poly_degree_one = sup(
        guessdegree(
            log10(1 + sollya.x) / sollya.x, approx_interval_one, S2**
            -(self.precision.get_field_size() + 1))) + 1
    poly_object_one = Polynomial.build_from_approximation(
        log10(1 + sollya.x) / sollya.x, poly_degree_one,
        [self.precision] * (poly_degree_one + 1), approx_interval_one,
        sollya.absolute).sub_poly(start_index=1)
    poly_one = PolynomialSchemeEvaluator.generate_horner_scheme(
        poly_object_one, red_vx_one, unified_precision=self.precision)
    poly_one.set_attributes(tag="poly_one", debug=debug_lftolx)
    result_one = red_vx_one + red_vx_one * poly_one
    cond_one = (vx < (1 + one_err)) & (vx > (1 - one_err))
    cond_one.set_attributes(tag="cond_one", debug=debugd, likely=False)

    # main scheme: special-value dispatch
    #   x < 0            -> invalid + qNaN
    #   NaN/inf          -> +inf for +inf, qNaN otherwise (invalid on sNaN)
    #   subnormal/zero   -> div-by-zero + -inf for zero, rescaled log else
    #   x == 1           -> +0
    #   biased exp == -1 -> result2 (cancellation-safe), else result
    print("MDL scheme")
    pre_scheme = ConditionBlock(
        neg_input,
        Statement(ClearException(), Raise(ML_FPE_Invalid),
                  Return(FP_QNaN(self.precision))),
        ConditionBlock(
            vx_nan_or_inf,
            ConditionBlock(
                vx_inf,
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                Statement(ClearException(),
                          ConditionBlock(vx_snan, Raise(ML_FPE_Invalid)),
                          Return(FP_QNaN(self.precision)))),
            ConditionBlock(
                vx_subnormal,
                ConditionBlock(
                    vx_zero,
                    Statement(
                        ClearException(),
                        Raise(ML_FPE_DivideByZero),
                        Return(FP_MinusInfty(self.precision)),
                    ),
                    Return(result_subnormal)),
                ConditionBlock(
                    vx_one,
                    Statement(
                        ClearException(),
                        Return(FP_PlusZero(self.precision)),
                    ),
                    ConditionBlock(exp_mone, Return(result2), Return(result))
                    #ConditionBlock(cond_one,
                    #Return(new_result_one),
                    #ConditionBlock(exp_mone,
                    #Return(result2),
                    #Return(result)
                    #)
                    #)
                ))))
    scheme = pre_scheme
    return scheme
def generate_scheme(self):
    """Build the metalibm operation graph (scheme) for atan(x).

    Strategy: record the sign, work on |x|, then pick one of five ranges
    (split at 2^bound, 39/16, 19/16, 11/16, 7/16) and reduce |x| to a small
    argument via atan identities, e.g. atan(x) = atan(c) + atan((x-c)/(1+c*x))
    with c in {1.5, 1.0, 0.5} or atan(x) = pi/2 - atan(1/x).  A fixed odd
    polynomial in the reduced argument plus a tabulated hi/lo atan(c)
    constant reconstructs the result; sign is re-applied at the end.
    NaN/inf inputs are handled by a separate branch.
    """
    def compute_reciprocal(vx):
        # Reciprocal built from a hardware seed refined by four
        # Newton-Raphson iterations (r' = 2r - x*r*r), avoiding a division.
        inv_seed = ReciprocalSeed(vx, precision = self.precision, tag = "inv_seed", debug = debug_multi)
        nr_1 = 2*inv_seed - vx*inv_seed*inv_seed
        nr_2 = 2*nr_1 - vx*nr_1*nr_1
        nr_3 =2*nr_2 - vx*nr_2*nr_2
        inv_vx = 2*nr_3 - vx*nr_3*nr_3
        return inv_vx

    vx = self.implementation.add_input_variable("x", self.get_input_precision())
    sollya_precision = self.precision.get_sollya_object()
    int_precision = {ML_Binary32 : ML_Int32, ML_Binary64 : ML_Int64}[self.precision]
    # hi parts of the atan(c) constants keep 12 fewer bits so the
    # reconstruction subtraction stays accurate
    hi_precision = self.precision.get_field_size() - 12

    half_pi = round(pi/2, sollya_precision, sollya.RN)
    half_pi_cst = Constant(half_pi, precision = self.precision)

    test_sign = Comparison(vx, 0, specifier = Comparison.Less, precision = ML_Bool, debug = debug_multi, tag = "Is_Negative")
    neg_vx = -vx

    # mutable locals of the generated code, assigned via ReferenceAssign
    sign = Variable("sign", precision = self.precision, var_type = Variable.Local)
    abs_vx_std = Variable("abs_vx", precision = self.precision, var_type = Variable.Local)
    red_vx_std = Variable("red_vx", precision = self.precision, var_type = Variable.Local)
    const_index_std = Variable("const_index", precision = int_precision, var_type = Variable.Local)

    # abs_vx = |vx|, sign = +/-1
    set_sign = Statement(
        ConditionBlock(test_sign,
            Statement(ReferenceAssign(abs_vx_std, neg_vx), ReferenceAssign(sign, -1)),
            Statement(ReferenceAssign(abs_vx_std, vx), ReferenceAssign(sign, 1))
        ))

    # above 2^precision, atan(|x|) rounds to pi/2
    if self.precision is ML_Binary32:
        bound = 24
    else:
        bound = 53

    # range-selection thresholds (descending)
    test_bound = Comparison(abs_vx_std, S2**bound, specifier = Comparison.GreaterOrEqual, precision = ML_Bool)#, debug = debug_multi, tag ="bound")
    test_bound1 = Comparison(abs_vx_std, 39.0/16.0, specifier = Comparison.GreaterOrEqual, precision = ML_Bool)#, debug = debug_multi, tag ="bound")
    test_bound2 = Comparison(abs_vx_std, 19.0/16.0, specifier = Comparison.GreaterOrEqual, precision = ML_Bool)#, debug = debug_multi, tag ="bound")
    test_bound3 = Comparison(abs_vx_std, 11.0/16.0, specifier = Comparison.GreaterOrEqual, precision = ML_Bool)#, debug = debug_multi, tag ="bound")
    test_bound4 = Comparison(abs_vx_std, 7.0/16.0, specifier = Comparison.GreaterOrEqual, precision = ML_Bool)#, debug = debug_multi, tag ="bound")

    # |x| >= 2^bound: atan(x) = sign * pi/2
    set_bound = Return(sign*half_pi_cst)

    # |x| >= 39/16: atan(x) = pi/2 - atan(1/x); red = -1/x, const = pi/2
    set_bound1 = Statement(
        ReferenceAssign(red_vx_std, -compute_reciprocal(abs_vx_std)),
        ReferenceAssign(const_index_std, 3)
    )

    # 19/16 <= |x| < 39/16: reduce around c = 1.5
    set_bound2 = Statement(
        ReferenceAssign(red_vx_std, (abs_vx_std - 1.5)*compute_reciprocal(1 + 1.5*abs_vx_std)),
        ReferenceAssign(const_index_std, 2)
    )

    # 11/16 <= |x| < 19/16: reduce around c = 1.0
    set_bound3 = Statement(
        ReferenceAssign(red_vx_std, (abs_vx_std - 1.0)*compute_reciprocal(abs_vx_std + 1.0)),
        ReferenceAssign(const_index_std, 1)
    )

    # 7/16 <= |x| < 11/16: reduce around c = 0.5
    set_bound4 = Statement(
        ReferenceAssign(red_vx_std, (abs_vx_std - 0.5)*compute_reciprocal(1 + abs_vx_std*0.5)),
        ReferenceAssign(const_index_std, 0)
    )

    # |x| < 7/16: no reduction; table entry 4 is (0, 0)
    set_bound5 = Statement(
        ReferenceAssign(red_vx_std, abs_vx_std),
        ReferenceAssign(const_index_std, 4)
    )

    # cons_table[i] = (hi, lo) split of the additive constant per range:
    # atan(0.5), atan(1.0), atan(1.5), pi/2, 0
    cons_table = ML_NewTable(dimensions = [5, 2], storage_precision = self.precision, tag = self.uniquify_name("cons_table"))
    # 11 polynomial coefficients for the odd atan series
    coeff_table = ML_NewTable(dimensions = [11], storage_precision = self.precision, tag = self.uniquify_name("coeff_table"))

    cons_hi = round(atan(0.5), hi_precision, sollya.RN)
    cons_table[0][0] = cons_hi
    cons_table[0][1] = round(atan(0.5) - cons_hi, sollya_precision, sollya.RN)

    cons_hi = round(atan(1.0), hi_precision, sollya.RN)
    cons_table[1][0] = cons_hi
    cons_table[1][1] = round(atan(1.0) - cons_hi, sollya_precision, sollya.RN)

    cons_hi = round(atan(1.5), hi_precision, sollya.RN)
    cons_table[2][0] = cons_hi
    cons_table[2][1] = round(atan(1.5) - cons_hi, sollya_precision, sollya.RN)

    cons_hi = round(pi/2, hi_precision, sollya.RN)
    cons_table[3][0] = cons_hi
    cons_table[3][1] = round(pi/2 - cons_hi, sollya_precision, sollya.RN)

    cons_table[4][0] = 0.0
    cons_table[4][1] = 0.0

    # Fixed coefficients of the atan polynomial; the values appear to match
    # the classic fdlibm atan coefficient set (aT[0..10]) -- TODO confirm.
    coeff_table[0] = round(3.33333333333329318027e-01, sollya_precision, sollya.RN)
    coeff_table[1] = round(-1.99999999998764832476e-01, sollya_precision, sollya.RN)
    coeff_table[2] = round(1.42857142725034663711e-01, sollya_precision, sollya.RN)
    coeff_table[3] = round(-1.11111104054623557880e-01, sollya_precision, sollya.RN)
    coeff_table[4] = round(9.09088713343650656196e-02, sollya_precision, sollya.RN)
    coeff_table[5] = round(-7.69187620504482999495e-02, sollya_precision, sollya.RN)
    coeff_table[6] = round(6.66107313738753120669e-02, sollya_precision, sollya.RN)
    coeff_table[7] = round(-5.83357013379057348645e-02, sollya_precision, sollya.RN)
    coeff_table[8] = round(4.97687799461593236017e-02, sollya_precision, sollya.RN)
    coeff_table[9] = round(-3.65315727442169155270e-02, sollya_precision, sollya.RN)
    coeff_table[10] = round(1.62858201153657823623e-02, sollya_precision, sollya.RN)

    red_vx2 = red_vx_std*red_vx_std
    red_vx4 = red_vx2*red_vx2

    a0 = TableLoad(coeff_table, 0, precision = self.precision)
    a1 = TableLoad(coeff_table, 1, precision = self.precision)
    a2 = TableLoad(coeff_table, 2, precision = self.precision)
    a3 = TableLoad(coeff_table, 3, precision = self.precision)
    a4 = TableLoad(coeff_table, 4, precision = self.precision)
    a5 = TableLoad(coeff_table, 5, precision = self.precision)
    a6 = TableLoad(coeff_table, 6, precision = self.precision)
    a7 = TableLoad(coeff_table, 7, precision = self.precision)
    a8 = TableLoad(coeff_table, 8, precision = self.precision)
    a9 = TableLoad(coeff_table, 9, precision = self.precision)
    a10 = TableLoad(coeff_table, 10, precision = self.precision)

    # polynomial split into even/odd coefficient halves in red_vx4 so the
    # two Horner chains can be evaluated independently
    poly_even = red_vx2*(a0 + red_vx4*(a2 + red_vx4*(a4 + red_vx4*(a6 + red_vx4*(a8 + red_vx4*a10)))))
    poly_odd = red_vx4*(a1 + red_vx4*(a3 + red_vx4*(a5 + red_vx4*(a7 + red_vx4*a9))))

    poly_even.set_attributes(tag = "poly_even", debug = debug_multi)
    poly_odd.set_attributes(tag = "poly_odd", debug = debug_multi)

    const_load_hi = TableLoad(cons_table, const_index_std, 0, tag = "const_load_hi", debug = debug_multi)
    const_load_lo = TableLoad(cons_table, const_index_std, 1, tag = "const_load_lo", debug = debug_multi)

    test_NaN_or_inf = Test(vx, specifier = Test.IsInfOrNaN, tag = "nan_or_inf", likely = False)
    test_nan = Test(vx, specifier = Test.IsNaN, debug = debug_multi, tag = "is_nan_test", likely = False)
    test_positive = Comparison(vx, 0, specifier = Comparison.GreaterOrEqual, debug = debug_multi, tag = "inf_sign", likely = False)

    # reconstruction: atan(c)_hi + (red + red*poly + atan(c)_lo), written so
    # the lo compensation is absorbed before the final subtraction
    result = const_load_hi - ((red_vx_std*(poly_even + poly_odd) - const_load_lo) - red_vx_std)
    result.set_attributes(tag = "result", debug = debug_multi)

    # standard path: declare locals, set sign/|x|, pick range, return
    std_scheme = Statement(
        sign,
        abs_vx_std,
        red_vx_std,
        const_index_std,
        set_sign,
        ConditionBlock(
            test_bound,
            set_bound,
            ConditionBlock(
                test_bound1,
                set_bound1,
                ConditionBlock(
                    test_bound2,
                    set_bound2,
                    ConditionBlock(
                        test_bound3,
                        set_bound3,
                        ConditionBlock(
                            test_bound4,
                            set_bound4,
                            set_bound5
                        )
                    )
                )
            )
        ),
        Return(sign*result)
    )
    # special path: NaN -> qNaN, +/-inf -> +/-pi/2
    infty_return = ConditionBlock(test_positive, Return(half_pi_cst), Return(-half_pi_cst))
    non_std_return = ConditionBlock(test_nan, Return(FP_QNaN(self.precision)), infty_return)
    scheme = ConditionBlock(test_NaN_or_inf, Statement(ClearException(), non_std_return), std_scheme)

    return scheme
def generate_scheme(self):
    """Build the operation-graph scheme for a table-driven logarithm.

    The input mantissa is reduced with a reciprocal seed, the reduced
    argument is fed to a Horner polynomial, and tabulated log values of
    the reciprocals are added back.  Special inputs (negative, NaN, inf,
    subnormal, zero) are dispatched by a ConditionBlock cascade.
    Returns the scheme's root operation node.
    """
    vx = self.implementation.add_input_variable("x", self.get_input_precision())
    sollya_precision = self.get_input_precision().get_sollya_object()
    log_f = sollya.log(sollya.x)  # /sollya.log(self.basis)

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    # special-input predicates (NaN/inf detection, sign test)
    test_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False, debug=True, tag="nan_or_inf")
    test_nan = Test(vx, specifier=Test.IsNaN, debug=True, tag="is_nan_test")
    test_positive = Comparison(vx, 0, specifier=Comparison.GreaterOrEqual, debug=True, tag="inf_sign")
    test_signaling_nan = Test(vx, specifier=Test.IsSignalingNaN, debug=True, tag="is_signaling_nan")
    return_snan = Statement(
        ExpRaiseReturn(ML_FPE_Invalid, return_value=FP_QNaN(self.precision)))

    # log(2) split into a hi/lo double-word constant pair
    log2_hi_value = round(
        log_f(2),
        self.precision.get_field_size() - (self.precision.get_exponent_size() + 1),
        RN)
    log2_lo_value = round(
        log_f(2) - log2_hi_value, self.precision.sollya_object, RN)
    log2_hi = Constant(log2_hi_value, precision=self.precision)
    log2_lo = Constant(log2_lo_value, precision=self.precision)

    int_precision = self.precision.get_integer_format()
    vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debug_multi)

    #---------------------
    # Approximation scheme
    #---------------------
    # log10(x) = log10(m.2^e) = log10(m.2^(e-t+t))
    #          = log10(m.2^-t) + (e+t) log10(2)
    # t = (m > sqrt(2)) ? 1 : 0 is used to avoid catastrophic cancellation
    # when e = -1 and m ~ 2
    #
    #
    # log10(m.2^-t) = log10(m.r/r.2^-t) = log10(m.r) + log10(2^-t/r)
    #               = log10(m.r) - log10(r.2^t)
    # where r = rcp(m) an approximation of 1/m such that r.m ~ 1

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision=self.precision)
    dummy_div_seed = ReciprocalSeed(dummy_var, precision=self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_div_seed, language=None,
        table_getter=lambda self: self.approx_table_map)

    # table creation: log_table holds log(rcp) hi/lo words,
    # log_table_tho holds the same for the t=1 (m > sqrt(2)) variant
    table_index_size = inv_approx_table.index_size
    table_index_range = range(1, 2**table_index_size)
    log_table = ML_NewTable(dimensions=[2**table_index_size, 2],
                            storage_precision=self.precision)
    log_table_tho = ML_NewTable(dimensions=[2**table_index_size, 2],
                                storage_precision=self.precision)
    log_table[0][0] = 0.0
    log_table[0][1] = 0.0
    log_table_tho[0][0] = 0.0
    log_table_tho[0][1] = 0.0
    hi_size = self.precision.get_field_size() - (
        self.precision.get_exponent_size() + 1)
    for i in table_index_range:
        #inv_value = (1.0 + (self.processor.inv_approx_table[i] / S2**9) + S2**-52) * S2**-1
        #inv_value = (1.0 + (inv_approx_table[i][0] / S2**9) ) * S2**-1
        inv_value = inv_approx_table[i]
        value_high = round(log_f(inv_value), hi_size, sollya.RN)
        value_low = round(
            log_f(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low
        inv_value_tho = S2 * inv_approx_table[i]
        value_high_tho = round(log_f(inv_value_tho), hi_size, sollya.RN)
        value_low_tho = round(
            log_f(inv_value_tho) - value_high_tho, sollya_precision, sollya.RN)
        log_table_tho[i][0] = value_high_tho
        log_table_tho[i][1] = value_low_tho

    # determining log_table range
    high_index_function = lambda table, i: table[i][0]
    low_index_function = lambda table, i: table[i][1]
    table_high_interval = log_table.get_subset_interval(
        high_index_function, table_index_range)
    table_low_interval = log_table.get_subset_interval(
        low_index_function, table_index_range)

    def compute_log(_vx, exp_corr_factor=None):
        """Build the log computation subgraph for operand _vx.

        exp_corr_factor, when given, is added to the extracted exponent
        (used by the subnormal path which pre-scales the input by 2^100).
        """
        _vx_mant = MantissaExtraction(_vx, tag="_vx_mant",
                                      precision=self.precision,
                                      debug=debug_multi)
        _vx_exp = ExponentExtraction(_vx, tag="_vx_exp", debug=debug_multi)
        table_index = inv_approx_table.index_function(_vx_mant)
        table_index.set_attributes(tag="table_index", debug=debug_multi)
        # tho = 1.0 when mantissa exceeds sqrt(2), avoiding cancellation
        # near e = -1 (see scheme comment above)
        tho_cond = _vx_mant > Constant(sollya.sqrt(2), precision=self.precision)
        tho = Select(tho_cond,
                     Constant(1.0, precision=self.precision),
                     Constant(0.0, precision=self.precision),
                     precision=self.precision,
                     tag="tho",
                     debug=debug_multi)
        rcp = ReciprocalSeed(_vx_mant, precision=self.precision, tag="rcp")
        r = Multiplication(rcp, _vx_mant, precision=self.precision, tag="r")
        int_format = self.precision.get_integer_format()
        # argument reduction
        # TODO: detect if single operand inverse seed is supported by the targeted architecture
        # masking the seed's last mantissa bit (And with -2) so the table
        # index and the reduction factor stay consistent
        pre_arg_red_index = TypeCast(BitLogicAnd(
            TypeCast(ReciprocalSeed(_vx_mant, precision=self.precision,
                                    tag="seed", debug=debug_multi,
                                    silent=True),
                     precision=int_format),
            Constant(-2, precision=int_format),
            precision=int_format),
            precision=self.precision,
            tag="pre_arg_red_index",
            debug=debug_multi)
        arg_red_index = Select(Equal(table_index, 0), 1.0, pre_arg_red_index,
                               tag="arg_red_index", debug=debug_multi)
        _red_vx = arg_red_index * _vx_mant - 1.0
        inv_err = S2**-6
        red_interval = Interval(1 - inv_err, 1 + inv_err)
        _red_vx.set_attributes(tag="_red_vx", debug=debug_multi,
                               interval=red_interval)

        # return in case of standard (non-special) input
        _log_inv_lo = Select(tho_cond,
                             TableLoad(log_table_tho, table_index, 1),
                             TableLoad(log_table, table_index, 1),
                             tag="log_inv_lo", debug=debug_multi)
        _log_inv_hi = Select(tho_cond,
                             TableLoad(log_table_tho, table_index, 0),
                             TableLoad(log_table, table_index, 0),
                             tag="log_inv_hi", debug=debug_multi)
        Log.report(Log.Info, "building mathematical polynomial")
        approx_interval = Interval(-inv_err, inv_err)
        poly_degree = sup(
            guessdegree(
                log(1 + sollya.x) / sollya.x, approx_interval,
                S2**-(self.precision.get_field_size() + 1))) + 1
        global_poly_object = Polynomial.build_from_approximation(
            log(1 + x) / x, poly_degree,
            [self.precision] * (poly_degree + 1), approx_interval,
            sollya.absolute)
        poly_object = global_poly_object.sub_poly(start_index=1)
        Log.report(Log.Info, "generating polynomial evaluation scheme")
        _poly = PolynomialSchemeEvaluator.generate_horner_scheme(
            poly_object, _red_vx, unified_precision=self.precision)
        _poly.set_attributes(tag="poly", debug=debug_multi)
        Log.report(Log.Info, poly_object.get_sollya_object())

        # corrected exponent, converted to FP and offset by tho
        corr_exp = Conversion(
            _vx_exp if exp_corr_factor == None else _vx_exp + exp_corr_factor,
            precision=self.precision) + tho
        corr_exp.set_attributes(tag="corr_exp", debug=debug_multi)

        # _poly approximates log10(1+r)/r
        # _poly * red_vx approximates log10(x)
        m0h, m0l = Mul211(_red_vx, _poly)
        m0h, m0l = Add212(_red_vx, m0h, m0l)
        m0h.set_attributes(tag="m0h", debug=debug_multi)
        m0l.set_attributes(tag="m0l")
        l0_h = corr_exp * log2_hi
        l0_l = corr_exp * log2_lo
        l0_h.set_attributes(tag="l0_h")
        l0_l.set_attributes(tag="l0_l")
        rh, rl = Add222(l0_h, l0_l, m0h, m0l)
        rh.set_attributes(tag="rh0", debug=debug_multi)
        rl.set_attributes(tag="rl0", debug=debug_multi)
        rh, rl = Add222(-_log_inv_hi, -_log_inv_lo, rh, rl)
        rh.set_attributes(tag="rh", debug=debug_multi)
        rl.set_attributes(tag="rl", debug=debug_multi)
        # rescale by 1/log(basis) when the basis is not e
        if sollya.log(self.basis) != 1.0:
            lbh = self.precision.round_sollya_object(
                1 / sollya.log(self.basis))
            lbl = self.precision.round_sollya_object(
                1 / sollya.log(self.basis) - lbh)
            rh, rl = Mul222(rh, rl, lbh, lbl)
            return rh
        else:
            return rh

    result = compute_log(vx)
    result.set_attributes(tag="result", debug=debug_multi)

    # disabled gappa-based evaluation-error estimation
    if False:
        # building eval error map
        eval_error_map = {
            red_vx:
            Variable("red_vx", precision=self.precision,
                     interval=red_vx.get_interval()),
            log_inv_hi:
            Variable("log_inv_hi", precision=self.precision,
                     interval=table_high_interval),
            log_inv_lo:
            Variable("log_inv_lo", precision=self.precision,
                     interval=table_low_interval),
            corr_exp:
            Variable("corr_exp_g", precision=self.precision,
                     interval=self.precision.get_exponent_interval()),
        }
        # computing gappa error
        if is_gappa_installed():
            poly_eval_error = self.get_eval_error(result, eval_error_map)
            Log.report(Log.Info, "poly_eval_error: ", poly_eval_error)

    # special-input predicates for the final dispatch
    neg_input = Comparison(vx, 0, likely=False, specifier=Comparison.Less,
                           debug=debug_multi, tag="neg_input")
    vx_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                         debug=debug_multi, tag="nan_or_inf")
    vx_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False,
                   debug=debug_multi, tag="snan")
    vx_inf = Test(vx, specifier=Test.IsInfty, likely=False,
                  debug=debug_multi, tag="inf")
    vx_subnormal = Test(vx, specifier=Test.IsSubnormal, likely=False,
                        debug=debug_multi, tag="vx_subnormal")
    vx_zero = Test(vx, specifier=Test.IsZero, likely=False,
                   debug=debug_multi, tag="vx_zero")
    exp_mone = Equal(vx_exp, -1, tag="exp_minus_one", debug=debug_multi,
                     likely=False)

    # exp=-1 case
    Log.report(Log.Info, "managing exp=-1 case")
    #red_vx_2 = arg_red_index * vx_mant * 0.5
    #approx_interval2 = Interval(0.5 - inv_err, 0.5 + inv_err)
    #poly_degree2 = sup(guessdegree(log(x), approx_interval2, S2**-(self.precision.get_field_size()+1))) + 1
    #poly_object2 = Polynomial.build_from_approximation(log(sollya.x), poly_degree, [self.precision]*(poly_degree+1), approx_interval2, sollya.absolute)
    #print "poly_object2: ", poly_object2.get_sollya_object()
    #poly2 = PolynomialSchemeEvaluator.generate_horner_scheme(poly_object2, red_vx_2, unified_precision = self.precision)
    #poly2.set_attributes(tag = "poly2", debug = debug_multi)
    #result2 = (poly2 - log_inv_hi - log_inv_lo)

    # subnormal path: pre-scale input by 2^100, correct exponent by -100
    m100 = -100
    S2100 = Constant(S2**100, precision=self.precision)
    result_subnormal = compute_log(vx * S2100, exp_corr_factor=m100)

    # main scheme
    Log.report(Log.Info, "MDL scheme")
    pre_scheme = ConditionBlock(
        neg_input,
        Statement(ClearException(), Raise(ML_FPE_Invalid),
                  Return(FP_QNaN(self.precision))),
        ConditionBlock(
            vx_nan_or_inf,
            ConditionBlock(
                vx_inf,
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                Statement(ClearException(),
                          ConditionBlock(vx_snan, Raise(ML_FPE_Invalid)),
                          Return(FP_QNaN(self.precision)))),
            ConditionBlock(
                vx_subnormal,
                ConditionBlock(
                    vx_zero,
                    Statement(
                        ClearException(),
                        Raise(ML_FPE_DivideByZero),
                        Return(FP_MinusInfty(self.precision)),
                    ),
                    Return(result_subnormal)),
                Return(result))))
    scheme = pre_scheme
    return scheme
def generate_scheme(self):
    """Build the scheme for a fixed-point table-driven sine/cosine.

    A shared table stores cos/sin samples over the input interval; the
    scaled fixed-point argument is split into a table index plus a small
    residual, and degree-2 polynomials reconstruct the result via an
    angle-addition style combination (delegated to generate_cos_scheme /
    generate_sin_scheme).  Returns the scheme's root statement.
    """
    # declaring CodeFunction and retrieving input variable
    vx = self.implementation.add_input_variable("x", self.precision)
    Log.report(Log.Info, "target: %s " % self.processor.target_name)

    # display parameter information
    Log.report(Log.Info, "accuracy : %s " % self.accuracy)
    Log.report(Log.Info, "input interval: %s " % self.input_interval)

    accuracy_goal = self.accuracy.get_goal()
    Log.report(Log.Info, "accuracy_goal=%f" % accuracy_goal)

    table_size_log = self.table_size_log
    integer_size = 31
    integer_precision = ML_Int32

    max_bound = sup(abs(self.input_interval))
    max_bound_log = int(ceil(log2(max_bound)))
    Log.report(Log.Info, "max_bound_log=%s " % max_bound_log)
    # scaling chosen so the input fills the 31-bit integer range
    scaling_power = integer_size - max_bound_log
    Log.report(Log.Info, "scaling power: %s " % scaling_power)

    storage_precision = ML_Custom_FixedPoint_Format(1, 30, signed=True)

    Log.report(Log.Info, "tabulating cosine and sine")
    # cosine and sine fused table
    fused_table = ML_NewTable(
        dimensions=[2**table_size_log, 2],
        storage_precision=storage_precision,
        tag="fast_lib_shared_table")  # self.uniquify_name("cossin_table"))
    # filling table
    for i in range(2**table_size_log):
        local_x = i / S2**table_size_log * S2**max_bound_log
        cos_local = cos(
            local_x
        )  # nearestint(cos(local_x) * S2**storage_precision.get_frac_size())
        sin_local = sin(
            local_x
        )  # nearestint(sin(local_x) * S2**storage_precision.get_frac_size())
        fused_table[i][0] = cos_local
        fused_table[i][1] = sin_local

    # argument reduction evaluation scheme
    # scaling_factor = Constant(S2**scaling_power, precision = self.precision)
    red_vx_precision = ML_Custom_FixedPoint_Format(31 - scaling_power,
                                                   scaling_power,
                                                   signed=True)
    Log.report(
        Log.Verbose, "red_vx_precision.get_c_bit_size()=%d" %
        red_vx_precision.get_c_bit_size())
    # red_vx = NearestInteger(vx * scaling_factor, precision = integer_precision)
    red_vx = Conversion(vx, precision=red_vx_precision, tag="red_vx",
                        debug=debug_fixed32)

    computation_precision = red_vx_precision  # self.precision
    output_precision = self.io_precisions[0]
    Log.report(Log.Info, "computation_precision is %s" % computation_precision)
    Log.report(Log.Info, "storage_precision is %s" % storage_precision)
    Log.report(Log.Info, "output_precision is %s" % output_precision)

    # mask keeping the top (table_size_log + 1) bits of the reduced input:
    # those bits select the table entry, the rest form the residual
    hi_mask_value = 2**32 - 2**(32 - table_size_log - 1)
    hi_mask = Constant(hi_mask_value, precision=ML_Int32)
    Log.report(Log.Info, "hi_mask=0x%x" % hi_mask_value)

    red_vx_hi_int = BitLogicAnd(TypeCast(red_vx, precision=ML_Int32),
                                hi_mask,
                                precision=ML_Int32,
                                tag="red_vx_hi_int",
                                debug=debugd)
    red_vx_hi = TypeCast(red_vx_hi_int,
                         precision=red_vx_precision,
                         tag="red_vx_hi",
                         debug=debug_fixed32)
    # residual = full reduced argument minus its tabulated high part
    red_vx_lo = red_vx - red_vx_hi
    red_vx_lo.set_attributes(precision=red_vx_precision,
                             tag="red_vx_lo",
                             debug=debug_fixed32)
    table_index = BitLogicRightShift(TypeCast(red_vx, precision=ML_Int32),
                                     scaling_power -
                                     (table_size_log - max_bound_log),
                                     precision=ML_Int32,
                                     tag="table_index",
                                     debug=debugd)

    tabulated_cos = TableLoad(fused_table, table_index, 0, tag="tab_cos",
                              precision=storage_precision,
                              debug=debug_fixed32)
    tabulated_sin = TableLoad(fused_table, table_index, 1, tag="tab_sin",
                              precision=storage_precision,
                              debug=debug_fixed32)

    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    Log.report(Log.Info, "building polynomial approximation for cosine")
    # cosine polynomial approximation
    poly_interval = Interval(0, S2**(max_bound_log - table_size_log))
    Log.report(Log.Info, "poly_interval=%s " % poly_interval)
    cos_poly_degree = 2  # int(sup(guessdegree(cos(x), poly_interval, accuracy_goal)))

    Log.report(Log.Verbose, "cosine polynomial approximation")
    cos_poly_object, cos_approx_error = Polynomial.build_from_approximation_with_error(
        cos(x), [0, 2], [0] + [computation_precision.get_bit_size()],
        poly_interval,
        sollya.absolute,
        error_function=error_function)
    #cos_eval_scheme = PolynomialSchemeEvaluator.generate_horner_scheme(cos_poly_object, red_vx_lo, unified_precision = computation_precision)
    Log.report(Log.Info, "cos_approx_error=%e" % cos_approx_error)
    cos_coeff_list = cos_poly_object.get_ordered_coeff_list()
    coeff_C0 = cos_coeff_list[0][1]
    coeff_C2 = Constant(cos_coeff_list[1][1],
                        precision=ML_Custom_FixedPoint_Format(-1, 32,
                                                              signed=True))

    Log.report(Log.Info, "building polynomial approximation for sine")
    # sine polynomial approximation (approximates sin(x)/x)
    sin_poly_degree = 2  # int(sup(guessdegree(sin(x)/x, poly_interval, accuracy_goal)))
    Log.report(Log.Info, "sine poly degree: %e" % sin_poly_degree)
    Log.report(Log.Verbose, "sine polynomial approximation")
    sin_poly_object, sin_approx_error = Polynomial.build_from_approximation_with_error(
        sin(sollya.x) / sollya.x, [0, 2],
        [0] + [computation_precision.get_bit_size()] * (sin_poly_degree + 1),
        poly_interval,
        sollya.absolute,
        error_function=error_function)
    sin_coeff_list = sin_poly_object.get_ordered_coeff_list()
    coeff_S0 = sin_coeff_list[0][1]
    coeff_S2 = Constant(sin_coeff_list[1][1],
                        precision=ML_Custom_FixedPoint_Format(-1, 32,
                                                              signed=True))

    # scheme selection between sine and cosine
    if self.cos_output:
        scheme = self.generate_cos_scheme(computation_precision,
                                          tabulated_cos, tabulated_sin,
                                          coeff_S2, coeff_C2, red_vx_lo)
    else:
        scheme = self.generate_sin_scheme(computation_precision,
                                          tabulated_cos, tabulated_sin,
                                          coeff_S2, coeff_C2, red_vx_lo)

    result = Conversion(scheme, precision=self.io_precisions[0])

    Log.report(
        Log.Verbose, "result operation tree :\n %s " % result.get_str(
            display_precision=True, depth=None, memoization_map={}))
    scheme = Statement(Return(result))

    return scheme
def generate_scheme(self):
    """Build the scheme for a table-driven 2**x approximation.

    The input is split into an integer part (whose high bits feed an
    exponent insertion and whose low bits index a hi/lo exp2 table) and
    a fractional residual evaluated with a Horner polynomial.  Overflow
    (integer part beyond the format's emax) returns +inf.
    Returns the scheme's root statement.
    """
    # declaring target and instantiating optimization engine
    vx = self.implementation.add_input_variable("x", self.precision)

    Log.set_dump_stdout(True)

    Log.report(Log.Info, "\033[33;1m generating implementation scheme \033[0m")
    if self.debug_flag:
        Log.report(Log.Info, "\033[31;1m debug has been enabled \033[0;m")

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    index_size = 3

    approx_interval = Interval(0.0, 2**-index_size)
    error_goal_approx = 2**-(self.precision.get_precision())

    int_precision = self.precision.get_integer_format()

    # integer part of vx * 2^index_size; the residual vx_frac lies in
    # [0, 2^-index_size)
    vx_int = Floor(vx * 2**index_size,
                   precision=self.precision,
                   tag="vx_int",
                   debug=debug_multi)
    vx_frac = vx - (vx_int * 2**-index_size)
    vx_frac.set_attributes(tag="vx_frac", debug=debug_multi, unbreakable=True)
    poly_degree = sup(
        guessdegree(2**(sollya.x), approx_interval, error_goal_approx)) + 1
    precision_list = [1] + [self.precision] * (poly_degree)

    vx_integer = Conversion(vx_int, precision=int_precision,
                            tag="vx_integer", debug=debug_multi)
    # high bits -> exponent contribution, low bits -> table index
    vx_int_hi = BitLogicRightShift(vx_integer, Constant(index_size),
                                   tag="vx_int_hi", debug=debug_multi)
    vx_int_lo = Modulo(vx_integer, 2**index_size,
                       tag="vx_int_lo", debug=debug_multi)
    pow_exp = ExponentInsertion(Conversion(vx_int_hi, precision=int_precision),
                                precision=self.precision,
                                tag="pow_exp",
                                debug=debug_multi)

    # hi/lo table of 2^(k * 2^-index_size) for k in [-2^index_size, 2^index_size)
    exp2_table = ML_NewTable(dimensions=[2 * 2**index_size, 2],
                             storage_precision=self.precision,
                             tag=self.uniquify_name("exp2_table"))
    for i in range(2 * 2**index_size):
        input_value = i - 2**index_size if i >= 2**index_size else i
        exp2_value = SollyaObject(2)**((input_value) * 2**-index_size)
        hi_value = round(exp2_value, self.precision.get_sollya_object(), RN)
        lo_value = round(exp2_value - hi_value,
                         self.precision.get_sollya_object(), RN)
        exp2_table[i][0] = lo_value
        exp2_table[i][1] = hi_value

    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    poly_object, poly_approx_error = Polynomial.build_from_approximation_with_error(
        2**(sollya.x),
        poly_degree,
        precision_list,
        approx_interval,
        sollya.absolute,
        error_function=error_function)

    # NOTE: was a Python 2 `print` statement (syntax error under Python 3);
    # routed through Log.report for consistency with the rest of the file
    Log.report(
        Log.Info, "poly_approx_error: %s, %s" %
        (poly_approx_error, float(log2(poly_approx_error))))

    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme
    poly = polynomial_scheme_builder(poly_object.sub_poly(start_index=1),
                                     vx_frac,
                                     unified_precision=self.precision)
    poly.set_attributes(tag="poly", debug=debug_multi)

    # offset index so negative low parts map into the table's first half
    table_index = Addition(vx_int_lo,
                           Constant(2**index_size, precision=int_precision),
                           precision=int_precision,
                           tag="table_index",
                           debug=debug_multi)

    lo_value_load = TableLoad(exp2_table, table_index, 0,
                              tag="lo_value_load", debug=debug_multi)
    hi_value_load = TableLoad(exp2_table, table_index, 1,
                              tag="hi_value_load", debug=debug_multi)

    # (hi + lo) * (1 + poly) * 2^vx_int_hi, expanded to preserve accuracy
    result = (hi_value_load + (hi_value_load * poly +
                               (lo_value_load + lo_value_load * poly))) * pow_exp
    ov_flag = Comparison(vx_int_hi,
                         Constant(self.precision.get_emax(),
                                  precision=self.precision),
                         specifier=Comparison.Greater)

    # main scheme
    Log.report(Log.Info, "\033[33;1m MDL scheme \033[0m")
    scheme = Statement(
        Return(Select(ov_flag, FP_PlusInfty(self.precision), result)))
    return scheme
def generate_scheme(self):
    """Build the HDL scheme for a rounding-mode-aware reciprocal.

    A table seed followed by four Newton-Raphson iterations refines the
    reciprocal of the input mantissa; the result is normalized, rounded
    according to the rnd_mode input (RNE/RU/RD/RZ), recombined with the
    negated exponent and the sign, and emitted on vr_out.
    Returns the list of generated entities.
    """
    def get_virtual_cst(prec, value, language):
        # encode a constant value in the support format of prec
        return prec.get_support_format().get_cst(
            prec.get_base_format().get_integer_coding(value, language))

    ## convert @p value from an input floating-point precision
    # @p in_precision to an output support format @p out_precision
    io_precision = VirtualFormat(base_format=self.precision,
                                 support_format=ML_StdLogicVectorFormat(
                                     self.precision.get_bit_size()),
                                 get_cst=get_virtual_cst)

    # declaring main input variable
    vx = self.implementation.add_input_signal("x", io_precision)
    # rounding mode input
    rnd_mode = self.implementation.add_input_signal(
        "rnd_mode", rnd_mode_format)

    if self.pipelined:
        self.implementation.add_input_signal("reset", ML_StdLogic)

    vx_precision = self.precision
    p = vx_precision.get_mantissa_size()
    exp_size = vx_precision.get_exponent_size()

    exp_vx_precision = ML_StdLogicVectorFormat(
        vx_precision.get_exponent_size())
    mant_vx_precision = ML_StdLogicVectorFormat(p)
    # fixed-point precision for operand's exponent
    exp_fixed_precision = fixed_point(exp_size, 0, signed=False)

    # mantissa extraction
    mant_vx = TypeCast(MantissaExtraction(vx,
                                          precision=mant_vx_precision,
                                          tag="extracted_mantissa"),
                       precision=fixed_point(1, p - 1, signed=False),
                       debug=debug_fixed,
                       tag="mant_vx")
    # exponent extraction
    exp_vx = TypeCast(RawExponentExtraction(vx,
                                            precision=exp_vx_precision,
                                            tag="exp_vx"),
                      precision=exp_fixed_precision)

    approx_index_size = 8
    approx_precision = fixed_point(
        2,
        approx_index_size,
    )

    # selecting table index from input mantissa MSBs
    tab_index = SubSignalSelection(mant_vx,
                                   p - 2 - approx_index_size + 1,
                                   p - 2,
                                   tag="tab_index")

    # declaring reciprocal approximation table
    inv_approx_table = ML_NewTable(dimensions=[2**approx_index_size],
                                   storage_precision=approx_precision,
                                   tag="inv_approx_table")
    for i in range(2**approx_index_size):
        num_input = 1 + i * S2**-approx_index_size
        table_value = io_precision.get_base_format().round_sollya_object(
            1 / num_input)
        inv_approx_table[i] = table_value

    # extracting initial reciprocal approximation
    inv_approx_value = TableLoad(inv_approx_table,
                                 tab_index,
                                 precision=approx_precision,
                                 tag="inv_approx_value",
                                 debug=debug_fixed)

    #inv_approx_value = TypeCast(inv_approx_value, precision = approx_precision)
    pre_it0_input = zext(
        SubSignalSelection(mant_vx,
                           p - 1 - approx_index_size,
                           p - 1,
                           tag="it0_input"), 1)
    it0_input = TypeCast(pre_it0_input,
                         precision=approx_precision,
                         tag="it0_input",
                         debug=debug_fixed)

    it1_precision = RTL_FixedPointFormat(
        2,
        2 * approx_index_size,
        support_format=ML_StdLogicVectorFormat(2 + 2 * approx_index_size))

    it1_input = mant_vx

    final_approx = generate_NR_iteration(
        mant_vx,
        inv_approx_value,
        (2, approx_index_size * 2),  # mult precision
        (-3, 2 * approx_index_size),  # error precision
        (2, approx_index_size * 3),  # new-approx mult
        (2, approx_index_size * 2),  # new approx precision
        self.implementation,
        pipelined=0,  #1 if self.pipelined else 0,
        tag_suffix="_first")

    # Inserting post-input pipeline stage
    if self.pipelined:
        self.implementation.start_new_stage()

    final_approx = generate_NR_iteration(
        mant_vx,
        final_approx,
        # mult precision
        (2, approx_index_size * 3),
        # error precision
        (-6, approx_index_size * 3),
        # approx mult precision
        (2, approx_index_size * 3),
        # new approx precision
        (2, approx_index_size * 3),
        self.implementation,
        pipelined=1 if self.pipelined else 0,
        tag_suffix="_second")

    # Inserting post-input pipeline stage
    if self.pipelined:
        self.implementation.start_new_stage()

    final_approx = generate_NR_iteration(
        mant_vx,
        final_approx,
        # mult-precision
        (2, 2 * p - 1),
        # error precision
        # FIX: wrap in int() — plain / yields a float under Python 3,
        # matching the int(...) already used by the sibling implementation
        (int(-(3 * approx_index_size) / 2), approx_index_size * 2 + p - 1),
        # mult approx mult precision
        (2, approx_index_size * 2 + p - 1),
        # approx precision
        (2, p),
        self.implementation,
        pipelined=2 if self.pipelined else 0,
        tag_suffix="_third")

    # Inserting post-input pipeline stage
    if self.pipelined:
        self.implementation.start_new_stage()

    final_approx = generate_NR_iteration(
        mant_vx,
        final_approx,
        (2, 2 * p),
        # FIX: int() for the same Python 3 true-division reason as above
        (int(-(4 * p) / 5), 2 * p),
        (2, 2 * p),
        (2, 2 * p),
        self.implementation,
        pipelined=2 if self.pipelined else 0,
        tag_suffix="_last")

    # Inserting post-input pipeline stage
    if self.pipelined:
        self.implementation.start_new_stage()

    final_approx.set_attributes(tag="final_approx", debug=debug_hex)

    last_approx_norm = final_approx

    # bit at the binary point: tells whether the approximation is
    # already normalized (>= 1) or must be shifted left
    offset_bit = BitSelection(last_approx_norm,
                              FixedPointPosition(
                                  last_approx_norm,
                                  0,
                                  align=FixedPointPosition.FromPointToLSB),
                              tag="offset_bit",
                              debug=debug_std)

    # extracting bit to determine if result should be left-shifted and
    # exponent incremented
    not_decrement = offset_bit

    final_approx_reduced = SubSignalSelection(
        final_approx,
        FixedPointPosition(final_approx,
                           -(p - 1),
                           align=FixedPointPosition.FromPointToLSB),
        FixedPointPosition(final_approx,
                           0,
                           align=FixedPointPosition.FromPointToLSB),
        precision=fixed_point(p, 0, signed=False))
    final_approx_reduced_shifted = SubSignalSelection(
        final_approx,
        FixedPointPosition(final_approx,
                           -p,
                           align=FixedPointPosition.FromPointToLSB),
        FixedPointPosition(final_approx,
                           -1,
                           align=FixedPointPosition.FromPointToLSB),
        precision=fixed_point(p, 0, signed=False))

    # unrounded mantissa field excluding leading digit
    unrounded_mant_field = Select(
        equal_to(not_decrement, 1),
        final_approx_reduced,
        final_approx_reduced_shifted,
        precision=fixed_point(p, 0, signed=False),
        tag="unrounded_mant_field",
        debug=debug_hex,
    )

    def get_bit(optree, bit_index):
        # select the bit bit_index places below the binary point
        bit_sel = BitSelection(
            optree,
            FixedPointPosition(optree,
                               -bit_index,
                               align=FixedPointPosition.FromPointToLSB))
        return bit_sel

    # LSB / round / sticky extraction; positions depend on whether the
    # mantissa was shifted (not_decrement)
    mant_lsb = Select(
        equal_to(not_decrement, 1),
        get_bit(final_approx, p - 1),
        get_bit(final_approx, p),
        precision=ML_StdLogic,
        tag="mant_lsb",
        debug=debug_std,
    )
    round_bit = Select(
        equal_to(not_decrement, 1),
        get_bit(final_approx, p),
        get_bit(final_approx, p + 1),
        precision=ML_StdLogic,
        tag="round_bit",
        debug=debug_std,
    )
    sticky_bit_input = Select(
        equal_to(not_decrement, 1),
        SubSignalSelection(final_approx,
                           0,
                           FixedPointPosition(
                               final_approx,
                               -(p + 1),
                               align=FixedPointPosition.FromPointToLSB),
                           precision=None,
                           tag="sticky_bit_input"),
        SubSignalSelection(final_approx,
                           0,
                           FixedPointPosition(
                               final_approx,
                               -(p + 2),
                               align=FixedPointPosition.FromPointToLSB),
                           precision=None,
                           tag="sticky_bit_input"),
    )
    sticky_bit = Select(Equal(sticky_bit_input, Constant(0, precision=None)),
                        Constant(0, precision=ML_StdLogic),
                        Constant(1, precision=ML_StdLogic),
                        precision=ML_StdLogic,
                        tag="sticky_bit",
                        debug=debug_std)
    # TODO: manage leading digit (in case of subnormal result)
    pre_result = unrounded_mant_field

    # real_exp = exp_vx - bias
    # - real_exp = bias - exp_vx
    # encoded negated exp = bias - exp_vx + bias = 2 * bias - exp_vx
    fp_io_precision = io_precision.get_base_format()

    neg_exp = -2 * fp_io_precision.get_bias() - exp_vx
    neg_exp.set_attributes(tag="neg_exp", debug=debug_fixed)
    res_exp = Subtraction(neg_exp,
                          Select(equal_to(not_decrement, 1),
                                 Constant(0, precision=exp_fixed_precision),
                                 Constant(1, precision=exp_fixed_precision),
                                 precision=None,
                                 tag="exp_offset",
                                 debug=debug_fixed),
                          tag="res_exp",
                          debug=debug_fixed)
    res_exp_field = SubSignalSelection(
        res_exp,
        FixedPointPosition(res_exp,
                           0,
                           align=FixedPointPosition.FromPointToLSB,
                           tag="res_exp_field LSB"),
        FixedPointPosition(res_exp,
                           exp_size - 1,
                           align=FixedPointPosition.FromPointToLSB,
                           tag="res_exp_field MSB"),
        precision=None,
        tag="res_exp_field",
        # debug=debug_fixed
    )

    result_sign = CopySign(vx, precision=ML_StdLogic)

    exp_mant_precision = ML_StdLogicVectorFormat(
        io_precision.get_bit_size() - 1)

    # decode rounding mode input
    rnd_mode_is_rne = Equal(rnd_mode, rnd_rne, precision=ML_Bool)
    rnd_mode_is_ru = Equal(rnd_mode, rnd_ru, precision=ML_Bool)
    rnd_mode_is_rd = Equal(rnd_mode, rnd_rd, precision=ML_Bool)
    rnd_mode_is_rz = Equal(rnd_mode, rnd_rz, precision=ML_Bool)

    # increment condition per rounding mode (RZ never increments)
    round_incr = Conversion(
        logical_or_reduce([
            logical_and_reduce([
                rnd_mode_is_rne,
                equal_to(round_bit, 1),
                equal_to(sticky_bit, 1)
            ]),
            logical_and_reduce([
                rnd_mode_is_rne,
                equal_to(round_bit, 1),
                equal_to(sticky_bit, 0),
                equal_to(mant_lsb, 1)
            ]),
            logical_and_reduce([
                rnd_mode_is_ru,
                equal_to(result_sign, 0),
                LogicalOr(equal_to(round_bit, 1),
                          equal_to(sticky_bit, 1),
                          precision=ML_Bool)
            ]),
            logical_and_reduce([
                rnd_mode_is_rd,
                equal_to(result_sign, 1),
                LogicalOr(equal_to(round_bit, 1),
                          equal_to(sticky_bit, 1),
                          precision=ML_Bool)
            ]),
        ]),
        precision=fixed_point(1, 0, signed=False),
        tag="round_incr",
        #debug=debug_fixed
    )

    # Precision for result without sign
    unsigned_result_prec = fixed_point((p - 1) + exp_size, 0)

    unrounded_mant_field_nomsb = Conversion(
        unrounded_mant_field,
        precision=fixed_point(p - 1, 0, signed=False),
        tag="unrounded_mant_field_nomsb",
        debug=debug_hex)

    # exponent/mantissa concatenated so the round increment can carry
    # into the exponent field
    pre_rounded_unsigned_result = Concatenation(
        res_exp_field,
        unrounded_mant_field_nomsb,
        precision=unsigned_result_prec,
        tag="pre_rounded_unsigned_result")
    unsigned_result_rounded = Addition(pre_rounded_unsigned_result,
                                       round_incr,
                                       precision=unsigned_result_prec,
                                       tag="unsigned_result")

    vr_out = TypeCast(Concatenation(
        result_sign,
        TypeCast(unsigned_result_rounded,
                 precision=ML_StdLogicVectorFormat(p - 1 + exp_size)),
        precision=ML_StdLogicVectorFormat(io_precision.get_bit_size())),
                      precision=io_precision,
                      debug=debug_hex,
                      tag="vr_out")

    self.implementation.add_output_signal("vr_out", vr_out)

    return [self.implementation]
def generate_scheme(self):
    """Build the HDL scheme for a round-to-nearest-even reciprocal.

    A table seed followed by Newton-Raphson iterations of increasing
    width refines the reciprocal of the input mantissa; the result is
    normalized, rounded (RNE only — no rnd_mode input in this variant),
    recombined with the negated exponent and the sign, and emitted on
    vr_out.  Returns the list of generated entities.
    """
    def get_virtual_cst(prec, value, language):
        # encode a constant value in the support format of prec
        return prec.get_support_format().get_cst(
            prec.get_base_format().get_integer_coding(value, language))

    ## convert @p value from an input floating-point precision
    # @p in_precision to an output support format @p out_precision
    io_precision = HdlVirtualFormat(self.precision)

    # declaring main input variable
    vx = self.implementation.add_input_signal("x", io_precision)

    if self.pipelined:
        self.implementation.add_input_signal("reset", ML_StdLogic)

    vx_precision = self.precision

    p = vx_precision.get_mantissa_size()

    exp_vx_precision = ML_StdLogicVectorFormat(vx_precision.get_exponent_size())
    mant_vx_precision = ML_StdLogicVectorFormat(p)

    # mantissa extraction
    mant_vx = MantissaExtraction(vx, precision = mant_vx_precision, tag = "mant_vx")
    # exponent extraction
    exp_vx = RawExponentExtraction(vx, precision = exp_vx_precision, tag = "exp_vx", debug = debug_dec)

    approx_index_size = 8

    approx_precision = RTL_FixedPointFormat(
        2, approx_index_size,
        support_format = ML_StdLogicVectorFormat(approx_index_size + 2),
    )

    # selecting table index from input mantissa MSBs
    tab_index = SubSignalSelection(mant_vx, p-2 - approx_index_size +1, p-2, tag = "tab_index")

    # declaring reciprocal approximation table
    inv_approx_table = ML_NewTable(dimensions = [2**approx_index_size], storage_precision = approx_precision, tag = "inv_approx_table")
    for i in range(2**approx_index_size):
        num_input = 1 + i * S2**-approx_index_size
        table_value = io_precision.get_base_format().round_sollya_object(1 / num_input)
        inv_approx_table[i] = table_value

    # extracting initial reciprocal approximation
    inv_approx_value = TableLoad(inv_approx_table, tab_index, precision = approx_precision, tag = "inv_approx_value", debug = debug_fixed)

    #inv_approx_value = TypeCast(inv_approx_value, precision = approx_precision)
    # iteration inputs: progressively wider slices of the mantissa
    pre_it0_input = zext(SubSignalSelection(mant_vx, p-1 - approx_index_size , p-1, tag = "it0_input"), 1)
    it0_input = TypeCast(pre_it0_input, precision = approx_precision, tag = "it0_input", debug = debug_fixed)

    it1_precision = RTL_FixedPointFormat(
        2, 2 * approx_index_size,
        support_format = ML_StdLogicVectorFormat(2 + 2 * approx_index_size)
    )

    pre_it1_input = zext(SubSignalSelection(mant_vx, p - 1 - 2 * approx_index_size, p -1, tag = "it1_input"), 1)
    it1_input = TypeCast(pre_it1_input, precision = it1_precision, tag = "it1_input", debug = debug_fixed)

    final_approx = generate_NR_iteration(
        it0_input,
        inv_approx_value,
        (2, approx_index_size * 2), # mult precision
        (-3, 2 * approx_index_size), # error precision
        (2, approx_index_size * 3), # new-approx mult
        (2, approx_index_size * 2), # new approx precision
        self.implementation,
        pipelined = 0, #1 if self.pipelined else 0,
        tag_suffix = "_first"
    )

    # Inserting post-input pipeline stage
    if self.pipelined:
        self.implementation.start_new_stage()

    final_approx = generate_NR_iteration(
        it1_input,
        final_approx,
        # mult precision
        (2, approx_index_size * 3),
        # error precision
        (-6, approx_index_size * 3),
        # approx mult precision
        (2, approx_index_size * 3),
        # new approx precision
        (2, approx_index_size * 3),
        self.implementation,
        pipelined = 1 if self.pipelined else 0,
        tag_suffix = "_second"
    )

    # Inserting post-input pipeline stage
    if self.pipelined:
        self.implementation.start_new_stage()

    last_it_precision = RTL_FixedPointFormat(
        2, p - 1,
        support_format=ML_StdLogicVectorFormat(2 + p - 1)
    )

    pre_last_it_input = zext(mant_vx, 1)
    last_it_input = TypeCast(
        pre_last_it_input, precision=last_it_precision,
        tag="last_it_input", debug=debug_fixed
    )

    final_approx = generate_NR_iteration(
        last_it_input,
        final_approx,
        # mult-precision
        (2, 2 * p - 1),
        # error precision
        (int(- (3 * approx_index_size) / 2), approx_index_size * 2 + p - 1),
        # mult approx mult precision
        (2, approx_index_size * 2 + p - 1),
        # approx precision
        (2, p),
        self.implementation,
        pipelined = 2 if self.pipelined else 0,
        tag_suffix = "_third"
    )

    # Inserting post-input pipeline stage
    if self.pipelined:
        self.implementation.start_new_stage()

    final_approx = generate_NR_iteration(
        last_it_input,
        final_approx,
        (2, 2 * p),
        (int(-(4 * p)/5), 2 * p),
        (2, 2 * p),
        (2, 2 * p),
        self.implementation,
        pipelined = 2 if self.pipelined else 0,
        tag_suffix = "_last"
    )

    # Inserting post-input pipeline stage
    if self.pipelined:
        self.implementation.start_new_stage()

    final_approx.set_attributes(tag = "final_approx", debug = debug_fixed)

    # bit indexes to select mantissa from final_approximation
    pre_mant_size = min(self.precision.get_field_size(), final_approx.get_precision().get_frac_size())
    final_approx_frac_msb_index = final_approx.get_precision().get_frac_size() - 1
    final_approx_frac_lsb_index = final_approx.get_precision().get_frac_size() - pre_mant_size

    # extracting bit to determine if result should be left-shifted and
    # exponent incremented
    cst_index = Constant(final_approx.get_precision().get_frac_size(), precision = ML_Integer)
    final_approx_casted = TypeCast(final_approx, precision = ML_StdLogicVectorFormat(final_approx.get_precision().get_bit_size()))
    not_decrement = final_approx_casted[cst_index]
    not_decrement.set_attributes(precision = ML_StdLogic, tag = "not_decrement", debug = debug_std)

    logic_1 = Constant(1, precision = ML_StdLogic)

    # mantissa slice selection: shifted by one when the approximation
    # needs normalization (not_decrement = 0)
    result = Select(
        Comparison(
            not_decrement, logic_1,
            specifier = Comparison.Equal,
            precision = ML_Bool),
        SubSignalSelection(
            TypeCast(
                final_approx,
                precision = ML_StdLogicVectorFormat(final_approx.get_precision().get_bit_size())
            ),
            final_approx_frac_lsb_index,
            final_approx_frac_msb_index,
        ),
        SubSignalSelection(
            TypeCast(
                final_approx,
                precision = ML_StdLogicVectorFormat(final_approx.get_precision().get_bit_size())
            ),
            final_approx_frac_lsb_index - 1,
            final_approx_frac_msb_index - 1,
        ),
        precision = ML_StdLogicVectorFormat(pre_mant_size),
        tag = "result"
    )

    def get_bit(optree, bit_index):
        # select a single std_logic bit at bit_index
        bit_index_cst = Constant(bit_index, precision = ML_Integer)
        bit_sel = VectorElementSelection(
            optree,
            bit_index_cst,
            precision = ML_StdLogic)
        return bit_sel

    # LSB / round / sticky bit extraction for round-to-nearest-even;
    # positions shift by one when normalization occurred
    least_bit = Select(
        Comparison(not_decrement, logic_1, specifier = Comparison.Equal, precision = ML_Bool),
        get_bit(final_approx_casted, final_approx_frac_lsb_index),
        get_bit(final_approx_casted, final_approx_frac_lsb_index - 1),
        precision = ML_StdLogic,
        tag = "least_bit",
        debug = debug_std,
    )
    round_bit = Select(
        Comparison(not_decrement, logic_1, specifier = Comparison.Equal, precision = ML_Bool),
        get_bit(final_approx_casted, final_approx_frac_lsb_index - 1),
        get_bit(final_approx_casted, final_approx_frac_lsb_index - 2),
        precision = ML_StdLogic,
        tag = "round_bit",
        debug = debug_std,
    )
    sticky_bit_input = Select(
        Comparison(not_decrement, logic_1, specifier = Comparison.Equal, precision = ML_Bool),
        SubSignalSelection(
            final_approx_casted, 0, final_approx_frac_lsb_index - 2,
            precision = ML_StdLogicVectorFormat(final_approx_frac_lsb_index - 1)
        ),
        zext(
            SubSignalSelection(
                final_approx_casted, 0, final_approx_frac_lsb_index - 3,
                precision = ML_StdLogicVectorFormat(final_approx_frac_lsb_index - 2)
            ),
            1
        ),
        precision = ML_StdLogicVectorFormat(final_approx_frac_lsb_index - 1)
    )
    sticky_bit = Select(
        Equal(
            sticky_bit_input,
            Constant(0, precision = ML_StdLogicVectorFormat(final_approx_frac_lsb_index - 1))
        ),
        Constant(0, precision = ML_StdLogic),
        Constant(1, precision = ML_StdLogic),
        precision = ML_StdLogic,
        tag = "sticky_bit",
        debug = debug_std
    )
    # if mantissa require extension
    if pre_mant_size < self.precision.get_mantissa_size() - 1:
        result = rzext(result, self.precision.get_mantissa_size() - 1 - pre_mant_size)

    res_mant_field = result

    # real_exp = exp_vx - bias
    # - real_exp = bias - exp_vx
    # encoded negated exp = bias - exp_vx + bias = 2 * bias - exp_vx
    fp_io_precision = io_precision.get_base_format()
    exp_op_precision = ML_StdLogicVectorFormat(fp_io_precision.get_exponent_size() + 2)
    biasX2 = Constant(- 2 * fp_io_precision.get_bias(), precision = exp_op_precision)

    neg_exp = Subtraction(
        SignCast(
            biasX2,
            specifier = SignCast.Unsigned,
            precision = get_unsigned_precision(exp_op_precision)
        ),
        SignCast(
            zext(exp_vx, 2),
            specifier = SignCast.Unsigned,
            precision = get_unsigned_precision(exp_op_precision),
        ),
        precision = exp_op_precision,
        tag = "neg_exp",
        debug = debug_dec
    )
    neg_exp_field = SubSignalSelection(
        neg_exp,
        0,
        fp_io_precision.get_exponent_size() - 1,
        precision = ML_StdLogicVectorFormat(fp_io_precision.get_exponent_size())
    )

    # result exponent: negated exponent, decremented by one when the
    # mantissa was normalized (not_decrement = 0)
    res_exp = Addition(
        SignCast(
            neg_exp_field,
            precision = get_unsigned_precision(exp_vx.get_precision()),
            specifier = SignCast.Unsigned
        ),
        SignCast(
            Select(
                Comparison(not_decrement, logic_1, specifier = Comparison.Equal, precision = ML_Bool),
                Constant(0, precision = exp_vx_precision),
                Constant(-1, precision = exp_vx_precision),
                precision = exp_vx_precision
            ),
            precision = get_unsigned_precision(exp_vx_precision),
            specifier = SignCast.Unsigned
        ),
        precision = exp_vx_precision,
        tag = "result_exp",
        debug = debug_dec
    )

    res_sign = CopySign(vx, precision = ML_StdLogic)

    exp_mant_precision = ML_StdLogicVectorFormat(io_precision.get_bit_size() - 1)

    # round-to-nearest-even increment: round bit set AND (sticky OR lsb)
    round_incr = Select(
        LogicalAnd(
            Equal(round_bit, Constant(1, precision = ML_StdLogic)),
            LogicalOr(
                Equal(sticky_bit, Constant(1, precision = ML_StdLogic)),
                Equal(least_bit, Constant(1, precision = ML_StdLogic)),
                precision = ML_Bool,
            ),
            precision = ML_Bool,
        ),
        Constant(1, precision = ML_StdLogic),
        Constant(0, precision = ML_StdLogic),
        tag = "round_incr",
        precision = ML_StdLogic,
        debug = debug_std
    )

    # exponent/mantissa concatenated so the round increment can carry
    # into the exponent field
    exp_mant = Concatenation(
        res_exp,
        res_mant_field,
        precision = exp_mant_precision
    )

    exp_mant_rounded = Addition(
        SignCast(
            exp_mant,
            SignCast.Unsigned,
            precision = get_unsigned_precision(exp_mant_precision)
        ),
        round_incr,
        precision = exp_mant_precision,
        tag = "exp_mant_rounded"
    )
    vr_out = TypeCast(
        Concatenation(
            res_sign,
            exp_mant_rounded,
            precision = ML_StdLogicVectorFormat(io_precision.get_bit_size())
        ),
        precision = io_precision,
        debug = debug_hex,
        tag = "vr_out"
    )

    self.implementation.add_output_signal("vr_out", vr_out)

    return [self.implementation]
def generate_scalar_scheme(self, vx):
    """Build the scalar evaluation scheme approximating sinh(vx).

    The scheme performs an argument reduction against log(2)/2^index_size,
    tabulates 2^(+/-l * 2^-index_size) as hi/lo pairs, evaluates a single
    polynomial approximation of exp at +r and -r, and recombines the two
    exponential halves (sinh(x) = (exp(x) - exp(-x)) / 2).
    Returns the operation-graph Statement implementing the function.
    """
    Log.set_dump_stdout(True)
    Log.report(Log.Info, "\033[33;1m generating implementation scheme \033[0m")
    if self.debug_flag:
        Log.report(Log.Info, "\033[31;1m debug has been enabled \033[0;m")
    # number of index bits used to address the 2^(l/2^index_size) table
    index_size = 5

    comp_lo = (vx < 0)
    comp_lo.set_attributes(tag = "comp_lo", precision = ML_Bool)
    sign = Select(comp_lo, -1, 1, precision = self.precision)

    # as sinh is an odd function, we can simplify the input to its absolute
    # value once the sign has been extracted
    vx = Abs(vx)
    int_precision = self.precision.get_integer_format()

    # argument reduction
    arg_reg_value = log(2)/2**index_size
    inv_log2_value = round(1/arg_reg_value, self.precision.get_sollya_object(), sollya.RN)
    inv_log2_cst = Constant(inv_log2_value, precision = self.precision, tag = "inv_log2")

    # for r_hi to be accurate we ensure k * log2_hi_value_cst is exact
    # by limiting the number of non-zero bits in log2_hi_value_cst
    # cosh(x) ~ exp(abs(x))/2 for a big enough x
    # cosh(x) > 2^1023 <=> exp(x) > 2^1024 <=> x > log(2^1024)
    # k = inv_log2_value * x
    # -1 for guard
    max_k_approx = inv_log2_value * log(sollya.SollyaObject(2)**1024)
    max_k_bitsize = int(ceil(log2(max_k_approx)))
    Log.report(Log.Info, "max_k_bitsize: %d" % max_k_bitsize)
    # hi limb keeps only enough bits so that k * log2_hi is exact
    log2_hi_value_precision = self.precision.get_precision() - max_k_bitsize - 1

    # 2-limb split of the reduction constant
    log2_hi_value = round(arg_reg_value, log2_hi_value_precision, sollya.RN)
    log2_lo_value = round(arg_reg_value - log2_hi_value, self.precision.get_sollya_object(), sollya.RN)
    log2_hi_value_cst = Constant(log2_hi_value, tag = "log2_hi_value", precision = self.precision)
    log2_lo_value_cst = Constant(log2_lo_value, tag = "log2_lo_value", precision = self.precision)

    # k = trunc(x / (log(2)/2^index_size))
    k = Trunc(Multiplication(inv_log2_cst, vx), precision = self.precision)
    k_log2 = Multiplication(k, log2_hi_value_cst, precision = self.precision, exact = True, tag = "k_log2", unbreakable = True)
    r_hi = vx - k_log2
    r_hi.set_attributes(tag = "r_hi", debug = debug_multi, unbreakable = True)
    r_lo = -k * log2_lo_value_cst
    # reduced argument
    r = r_hi + r_lo
    r.set_attributes(tag = "r", debug = debug_multi)

    if is_gappa_installed():
        # numerical bound on the reduction error, via gappa
        r_eval_error = self.get_eval_error(r_hi, variable_copy_map = {
            vx: Variable("vx", interval = Interval(0, 715), precision = self.precision),
            k: Variable("k", interval = Interval(0, 1024), precision = self.precision)
        })
        Log.report(Log.Verbose, "r_eval_error: ", r_eval_error)

    approx_interval = Interval(-arg_reg_value, arg_reg_value)
    error_goal_approx = 2**-(self.precision.get_precision())

    poly_degree = sup(guessdegree(exp(sollya.x), approx_interval, error_goal_approx)) + 3
    precision_list = [1] + [self.precision] * (poly_degree)

    # split k into table index (low bits) and power-of-two exponent (high bits)
    k_integer = Conversion(k, precision = int_precision, tag = "k_integer", debug = debug_multi)
    k_hi = BitLogicRightShift(k_integer, Constant(index_size, precision=int_precision), tag = "k_int_hi", precision = int_precision, debug = debug_multi)
    k_lo = Modulo(k_integer, 2**index_size, tag = "k_int_lo", precision = int_precision, debug = debug_multi)
    # NOTE(review): pow_exp appears unused below (pow_exp_pos / pow_exp_neg
    # are used instead) — confirm before removing
    pow_exp = ExponentInsertion(Conversion(k_hi, precision = int_precision), precision = self.precision, tag = "pow_exp", debug = debug_multi)

    # table of 2^(+/- l * 2^-index_size) stored as (neg_hi, neg_lo, pos_hi, pos_lo)
    exp_table = ML_NewTable(dimensions = [2 * 2**index_size, 4], storage_precision = self.precision, tag = self.uniquify_name("exp2_table"))
    for i in range(2 * 2**index_size):
        input_value = i - 2**index_size if i >= 2**index_size else i
        reduced_hi_prec = int(self.precision.get_mantissa_size() - 8)
        # using SollyaObject wrapper to force evaluation by sollya
        # with higher precision
        exp_value = sollya.SollyaObject(2)**((input_value)* 2**-index_size)
        mexp_value = sollya.SollyaObject(2)**((-input_value)* 2**-index_size)
        pos_value_hi = round(exp_value, reduced_hi_prec, sollya.RN)
        pos_value_lo = round(exp_value - pos_value_hi, self.precision.get_sollya_object(), sollya.RN)
        neg_value_hi = round(mexp_value, reduced_hi_prec, sollya.RN)
        neg_value_lo = round(mexp_value - neg_value_hi, self.precision.get_sollya_object(), sollya.RN)
        exp_table[i][0] = neg_value_hi
        exp_table[i][1] = neg_value_lo
        exp_table[i][2] = pos_value_hi
        exp_table[i][3] = pos_value_lo

    # log2_value = log(2) / 2^index_size
    # sinh(x) = 1/2 * (exp(x) - exp(-x))
    # exp(x) = exp(x - k * log2_value + k * log2_value)
    #
    # r = x - k * log2_value
    # exp(x) = exp(r) * 2 ^ (k / 2^index_size)
    #
    # k / 2^index_size = h + l * 2^-index_size, with k, h, l integers
    # exp(x) = exp(r) * 2^h * 2^(l *2^-index_size)
    #
    # sinh(x) = exp(r) * 2^(h-1) * 2^(l *2^-index_size) - exp(-r) * 2^(-h-1) * 2^(-l *2^-index_size)
    # S=2^(h-1), T = 2^(-h-1)
    # exp(r) = 1 + poly_pos(r)
    # exp(-r) = 1 + poly_neg(r)
    # 2^(l / 2^index_size) = pos_value_hi + pos_value_lo
    # 2^(-l / 2^index_size) = neg_value_hi + neg_value_lo
    #
    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    poly_object, poly_approx_error = Polynomial.build_from_approximation_with_error(exp(sollya.x), poly_degree, precision_list, approx_interval, sollya.absolute, error_function = error_function)

    Log.report(Log.Verbose, "poly_approx_error: {}, {}".format(poly_approx_error, float(log2(poly_approx_error))))

    # a single polynomial is evaluated at +r and -r (exp is the same function)
    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme
    poly_pos = polynomial_scheme_builder(poly_object.sub_poly(start_index = 1), r, unified_precision = self.precision)
    poly_pos.set_attributes(tag = "poly_pos", debug = debug_multi)

    poly_neg = polynomial_scheme_builder(poly_object.sub_poly(start_index = 1), -r, unified_precision = self.precision)
    poly_neg.set_attributes(tag = "poly_neg", debug = debug_multi)

    # offset by 2^index_size so the table index is always positive
    table_index = Addition(k_lo, Constant(2**index_size, precision = int_precision), precision = int_precision, tag = "table_index", debug = debug_multi)

    neg_value_load_hi = TableLoad(exp_table, table_index, 0, tag = "neg_value_load_hi", debug = debug_multi)
    neg_value_load_lo = TableLoad(exp_table, table_index, 1, tag = "neg_value_load_lo", debug = debug_multi)
    pos_value_load_hi = TableLoad(exp_table, table_index, 2, tag = "pos_value_load_hi", debug = debug_multi)
    pos_value_load_lo = TableLoad(exp_table, table_index, 3, tag = "pos_value_load_lo", debug = debug_multi)

    # exponents h-1 / -h-1, clamped to the normal range to avoid
    # building an invalid exponent insertion
    k_plus = Max(
        Subtraction(k_hi, Constant(1, precision = int_precision), precision=int_precision, tag="k_plus", debug=debug_multi),
        Constant(self.precision.get_emin_normal(), precision = int_precision))
    k_neg = Max(
        Subtraction(-k_hi, Constant(1, precision=int_precision), precision=int_precision, tag="k_neg", debug=debug_multi),
        Constant(self.precision.get_emin_normal(), precision = int_precision))

    # 2^(h-1)
    pow_exp_pos = ExponentInsertion(k_plus, precision = self.precision, tag="pow_exp_pos", debug=debug_multi)
    # 2^(-h-1)
    pow_exp_neg = ExponentInsertion(k_neg, precision = self.precision, tag="pow_exp_neg", debug=debug_multi)

    # leading (order-0) contribution of both exponential halves
    hi_terms = (pos_value_load_hi * pow_exp_pos - neg_value_load_hi * pow_exp_neg)
    hi_terms.set_attributes(tag = "hi_terms", debug=debug_multi)

    # polynomial correction terms for exp(r) and exp(-r)
    pos_exp = (pos_value_load_hi * poly_pos + (pos_value_load_lo + pos_value_load_lo * poly_pos)) * pow_exp_pos
    pos_exp.set_attributes(tag = "pos_exp", debug = debug_multi)

    neg_exp = (neg_value_load_hi * poly_neg + (neg_value_load_lo + neg_value_load_lo * poly_neg)) * pow_exp_neg
    neg_exp.set_attributes(tag = "neg_exp", debug = debug_multi)

    result = Addition(
        Subtraction(
            pos_exp,
            neg_exp,
            precision=self.precision,
        ),
        hi_terms,
        precision=self.precision,
        tag="result",
        debug=debug_multi
    )

    # ov_value: overflow threshold — inputs beyond it return +/-inf
    ov_value = round(asinh(self.precision.get_max_value()), self.precision.get_sollya_object(), sollya.RD)
    ov_flag = Comparison(Abs(vx), Constant(ov_value, precision = self.precision), specifier = Comparison.Greater)

    # main scheme: overflow check, then sign re-application
    scheme = Statement(
        Return(
            Select(
                ov_flag,
                sign*FP_PlusInfty(self.precision),
                sign*result
            )))

    return scheme
def generate_scheme(self):
    """Build the evaluation scheme approximating log1p(x).

    Uses a reciprocal-seed based argument reduction, a tabulated 2-limb
    log of the seed values, a polynomial approximation of log1p(r)/r on
    the residual interval, and an extended-precision (2-limb) final
    summation. Returns the operation-graph scheme.
    """
    vx = self.implementation.add_input_variable("x", self.precision)
    sollya_precision = self.get_input_precision().sollya_object

    # local overloading of RaiseReturn operation
    # NOTE(review): ExpRaiseReturn appears unused in this scheme — confirm
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    # 2-limb approximation of log(2)
    # hi part precision is reduced to provide exact operation
    # when multiplied by an exponent value
    log2_hi_value = round(log(2), self.precision.get_field_size() - (self.precision.get_exponent_size() + 1), sollya.RN)
    log2_lo_value = round(log(2) - log2_hi_value, self.precision.sollya_object, sollya.RN)
    log2_hi = Constant(log2_hi_value, precision=self.precision)
    log2_lo = Constant(log2_lo_value, precision=self.precision)

    int_precision = self.precision.get_integer_format()

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision = self.precision)
    dummy_rcp_seed = ReciprocalSeed(dummy_var, precision = self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(dummy_rcp_seed, language = None, table_getter = lambda self: self.approx_table_map)

    # table creation
    table_index_size = inv_approx_table.index_size
    log_table = ML_NewTable(dimensions = [2**table_index_size, 2], storage_precision = self.precision)
    # storing accurate logarithm approximation of value returned
    # by the fast reciprocal operation
    for i in range(0, 2**table_index_size):
        inv_value = inv_approx_table[i]
        # hi limb trimmed so that exponent * hi products stay exact
        value_high = round(log(inv_value), self.precision.get_field_size() - (self.precision.get_exponent_size() + 1), sollya.RN)
        value_low = round(log(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low

    # special input predicates
    neg_input = Comparison(vx, -1, likely=False, precision=ML_Bool, specifier=Comparison.Less, debug=debug_multi, tag="neg_input")
    vx_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False, precision=ML_Bool, debug=debug_multi, tag="nan_or_inf")
    vx_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False, debug=debug_multi, tag="snan")
    vx_inf = Test(vx, specifier=Test.IsInfty, likely=False, debug=debug_multi, tag="inf")
    # NOTE(review): vx_subnormal appears unused in this scheme — confirm
    vx_subnormal = Test(vx, specifier=Test.IsSubnormal, likely=False, debug=debug_multi, tag="vx_subnormal")

    # for x = m.2^e, such that e >= 0
    #
    # log(1+x) = log(1 + m.2^e)
    #          = log(2^e . 2^-e + m.2^e)
    #          = log(2^e . (2^-e + m))
    #          = log(2^e) + log(2^-e + m)
    #          = e . log(2) + log (2^-e + m)
    #
    # t = (2^-e + m)
    # t = m_t . 2^e_t
    # r ~ 1 / m_t => r.m_t ~ 1 ~ 0
    #
    # t' = t . 2^-e_t
    #    = 2^-e-e_t + m . 2^-e_t
    #
    # if e >= 0, then 2^-e <= 1, then 1 <= m + 2^-e <= 3
    # r = m_r . 2^e_r
    #
    # log(1+x) = e.log(2) + log(r . 2^e_t . 2^-e_t . (2^-e + m) / r)
    #          = e.log(2) + log(r . 2^(-e-e_t) + r.m.2^-e_t) + e_t . log(2)- log(r)
    #          = (e+e_t).log(2) + log(r . t') - log(r)
    #          = (e+e_t).log(2) + log(r . t') - log(r)
    #          = (e+e_t).log(2) + P_log1p(r . t' - 1) - log(r)
    #
    #

    # argument reduction
    m = MantissaExtraction(vx, tag="vx", precision=self.precision, debug=debug_multi)
    e = ExponentExtraction(vx, tag="e", precision=int_precision, debug=debug_multi)

    # 2^-e
    TwoMinusE = ExponentInsertion(-e, tag="Two_minus_e", precision=self.precision, debug=debug_multi)
    t = Addition(TwoMinusE, m, precision=self.precision, tag="t", debug=debug_multi)

    m_t = MantissaExtraction(t, tag="m_t", precision=self.precision, debug=debug_multi)
    e_t = ExponentExtraction(t, tag="e_t", precision=int_precision, debug=debug_multi)

    # 2^(-e-e_t)
    TwoMinusEEt = ExponentInsertion(-e-e_t, tag="Two_minus_e_et", precision=self.precision)
    TwoMinusEt = ExponentInsertion(-e_t, tag="Two_minus_et", precision=self.precision, debug=debug_multi)

    # fast reciprocal approximation of m_t
    rcp_mt = ReciprocalSeed(m_t, tag="rcp_mt", precision=self.precision, debug=debug_multi)

    INDEX_SIZE = table_index_size
    table_index = generic_mantissa_msb_index_fct(INDEX_SIZE, m_t)
    table_index.set_attributes(tag="table_index", debug=debug_multi)

    log_inv_lo = TableLoad(log_table, table_index, 1, tag="log_inv_lo", debug=debug_multi)
    log_inv_hi = TableLoad(log_table, table_index, 0, tag="log_inv_hi", debug=debug_multi)

    inv_err = S2**-6 # TODO: link to target DivisionSeed precision

    Log.report(Log.Info, "building mathematical polynomial")
    approx_interval = Interval(-inv_err, inv_err)
    approx_fct = sollya.log1p(sollya.x) / (sollya.x)
    poly_degree = sup(guessdegree(approx_fct, approx_interval, S2**-(self.precision.get_field_size()+1))) + 1
    Log.report(Log.Debug, "poly_degree is {}", poly_degree)
    global_poly_object = Polynomial.build_from_approximation(approx_fct, poly_degree, [self.precision]*(poly_degree+1), approx_interval, sollya.absolute)
    poly_object = global_poly_object # .sub_poly(start_index=1)

    # mapping from working precision to the multi-limb format used
    # for the accurate final summation
    EXT_PRECISION_MAP = {
        ML_Binary32: ML_SingleSingle,
        ML_Binary64: ML_DoubleDouble,
        ML_SingleSingle: ML_TripleSingle,
        ML_DoubleDouble: ML_TripleDouble
    }
    if not self.precision in EXT_PRECISION_MAP:
        Log.report(Log.Error, "no extended precision available for {}", self.precision)

    ext_precision = EXT_PRECISION_MAP[self.precision]

    # pre_rtp = r . 2^(-e-e_t) + m .2^-e_t
    pre_rtp = Addition(
        rcp_mt * TwoMinusEEt,
        Multiplication(
            rcp_mt,
            Multiplication(
                m,
                TwoMinusEt,
                precision=self.precision,
                tag="pre_mult",
                debug=debug_multi,
            ),
            precision=ext_precision,
            tag="pre_mult2",
            debug=debug_multi,
        ),
        precision=ext_precision,
        tag="pre_rtp",
        debug=debug_multi
    )
    # r . t' - 1, the polynomial input, computed in extended precision
    pre_red_vx = Addition(
        pre_rtp,
        -1,
        precision=ext_precision,
    )

    red_vx = Conversion(pre_red_vx, precision=self.precision, tag="red_vx", debug=debug_multi)

    Log.report(Log.Info, "generating polynomial evaluation scheme")
    poly = PolynomialSchemeEvaluator.generate_horner_scheme(
        poly_object, red_vx, unified_precision=self.precision)
    poly.set_attributes(tag="poly", debug=debug_multi)
    Log.report(Log.Debug, "{}", global_poly_object.get_sollya_object())

    # (e + e_t) converted to float for the e.log(2) contribution
    fp_e = Conversion(e + e_t, precision=self.precision, tag="fp_e", debug=debug_multi)

    ext_poly = Multiplication(red_vx, poly, precision=ext_precision)

    # (e+e_t).log2 + (-log(r) + P(red_vx)), all in extended precision
    pre_result = Addition(
        Addition(
            fp_e * log2_hi,
            fp_e * log2_lo,
            precision=ext_precision
        ),
        Addition(
            Addition(
                -log_inv_hi,
                -log_inv_lo,
                precision=ext_precision
            ),
            ext_poly,
            precision=ext_precision
        ),
        precision=ext_precision
    )

    result = Conversion(pre_result, precision=self.precision, tag="result", debug=debug_multi)

    # main scheme: special-value handling wrapped around the generic path
    Log.report(Log.Info, "MDL scheme")
    pre_scheme = ConditionBlock(neg_input,
        Statement(
            ClearException(),
            Raise(ML_FPE_Invalid),
            Return(FP_QNaN(self.precision))
        ),
        ConditionBlock(vx_nan_or_inf,
            ConditionBlock(vx_inf,
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                Statement(
                    ClearException(),
                    ConditionBlock(vx_snan,
                        Raise(ML_FPE_Invalid)
                    ),
                    Return(FP_QNaN(self.precision))
                )
            ),
            Return(result)
        )
    )
    scheme = pre_scheme
    return scheme
def generic_poly_split(offset_fct, indexing, target_eps, coeff_precision, vx):
    """ generate the meta approximation for @p offset_fct over several
        intervals defined by @p indexing object

        For each sub-interval, a polynomial approximation with maximal
        error @p target_eps is tabulated, and evaluated (Horner scheme)
        using format @p coeff_precision.
        The input variable is @p vx; returns the polynomial-evaluation
        operation node. """
    # computing degree for a different polynomial approximation on each
    # sub-interval
    poly_degree_list = [
        int(sup(guessdegree(offset_fct(offset), sub_interval, target_eps)))
        for offset, sub_interval in indexing.get_offseted_sub_list()
    ]
    poly_max_degree = max(poly_degree_list)

    # tabulating polynomial coefficients on split_num sub-interval of interval
    poly_table = ML_NewTable(
        dimensions=[indexing.split_num, poly_max_degree + 1],
        storage_precision=coeff_precision,
        const=True)
    offset_table = ML_NewTable(dimensions=[indexing.split_num],
                               storage_precision=coeff_precision,
                               const=True)
    max_error = 0.0

    for sub_index in range(indexing.split_num):
        poly_degree = poly_degree_list[sub_index]
        offset, approx_interval = indexing.get_offseted_sub_interval(sub_index)
        offset_table[sub_index] = offset
        if poly_degree == 0:
            # managing constant approximation separately since it seems
            # to break sollya
            local_approx = coeff_precision.round_sollya_object(
                offset_fct(offset)(inf(approx_interval)))
            poly_table[sub_index][0] = local_approx
            # zero-pad the unused higher-degree coefficients
            for monomial_index in range(1, poly_max_degree + 1):
                poly_table[sub_index][monomial_index] = 0
            approx_error = sollya.infnorm(
                offset_fct(offset) - local_approx, approx_interval)
        else:
            poly_object, approx_error = Polynomial.build_from_approximation_with_error(
                offset_fct(offset), poly_degree,
                [coeff_precision] * (poly_degree + 1), approx_interval,
                sollya.relative)
            for monomial_index in range(poly_max_degree + 1):
                if monomial_index <= poly_degree:
                    poly_table[sub_index][
                        monomial_index] = poly_object.coeff_map[monomial_index]
                else:
                    # zero-pad so every row has poly_max_degree+1 entries
                    poly_table[sub_index][monomial_index] = 0
        max_error = max(approx_error, max_error)

    Log.report(Log.Debug, "max approx error is {}", max_error)

    # indexing function: derive index from input @p vx value
    poly_index = indexing.get_index_node(vx)
    poly_index.set_attributes(tag="poly_index", debug=debug_multi)

    # building polynomial evaluation scheme
    offset = TableLoad(offset_table, poly_index, precision=coeff_precision,
                       tag="offset", debug=debug_multi)
    # Horner accumulator seeded with the leading coefficient
    poly = TableLoad(poly_table, poly_index, poly_max_degree,
                     precision=coeff_precision, tag="poly_init",
                     debug=debug_multi)
    red_vx = Subtraction(vx, offset, precision=vx.precision, tag="red_vx",
                         debug=debug_multi)
    # FIX: iterate from poly_max_degree - 1 down to 0. Starting the loop at
    # poly_max_degree would re-load the leading coefficient already used to
    # seed the accumulator, adding it twice and evaluating a wrong
    # degree-(d+1) polynomial.
    fma_precision = coeff_precision
    for monomial_index in range(poly_max_degree - 1, -1, -1):
        coeff = TableLoad(poly_table, poly_index, monomial_index,
                          precision=coeff_precision,
                          tag="poly_%d" % monomial_index, debug=debug_multi)
        poly = FMA(red_vx, poly, coeff, precision=fma_precision)

    return poly
def generate_scheme(self):
    """Build the evaluation scheme approximating cbrt(x).

    An initial approximation is assembled from two tables —
    cbrt(1 + i/2^index_size) indexed by the mantissa MSBs, and
    cbrt(2^(e mod 3)) indexed by the reduced exponent — scaled by
    2^(e div 3), then refined by @p num_iteration Newton-Raphson
    iterations. Returns the operation-graph Statement.
    """
    # declaring main input variable
    vx = self.implementation.add_input_variable("x", self.precision)
    # declaring approximation parameters
    index_size = 6
    num_iteration = 8
    Log.set_dump_stdout(True)
    Log.report(Log.Info, "\033[33;1m generating implementation scheme \033[0m")
    if self.debug_flag:
        Log.report(Log.Info, "\033[31;1m debug has been enabled \033[0;m")

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    def cbrt_newton_iteration(current_approx, input_value, input_inverse):
        # Cubic root of A is approximated by a Newton-Raphson iteration
        # on f(x) = 1 - A / x^3
        # x_n+1 = 4/3 * x_n - x_n^4 / (3 * A)
        # x_n+1 = 1/3 * (x_n * (1 - x_n^3/A) + x_n)
        # NOTE: only input_inverse (1/A) is consumed by the iteration;
        # input_value is kept for signature compatibility
        approx_triple = Multiplication(
            current_approx, Multiplication(current_approx, current_approx))
        diff = FMSN(approx_triple, input_inverse,
                    Constant(1, precision=self.precision))
        new_approx = FMA(
            Multiplication(
                current_approx,
                Constant(1 / 3.0, precision=self.precision),
            ), diff, current_approx)
        return new_approx

    reduced_vx = MantissaExtraction(vx, precision=self.precision)
    int_precision = self.precision.get_integer_format()

    # seed table: cbrt(1 + i / 2^index_size) over the mantissa interval
    cbrt_approx_table = ML_NewTable(
        dimensions=[2**index_size, 1],
        storage_precision=self.precision,
        tag=self.uniquify_name("cbrt_approx_table"))
    for i in range(2**index_size):
        input_value = 1 + i / SollyaObject(2**index_size)
        cbrt_approx = cbrt(input_value)
        cbrt_approx_table[i][0] = round(cbrt_approx,
                                        self.precision.get_sollya_object(),
                                        RN)

    # Modulo operations will returns a reduced exponent within [-3, 2]
    # so we approximate cbrt on this interval (with index offset by -3)
    cbrt_mod_table = ML_NewTable(dimensions=[6, 1],
                                 storage_precision=self.precision,
                                 tag=self.uniquify_name("cbrt_mod_table"))
    for i in range(6):
        input_value = SollyaObject(2)**(i - 3)
        cbrt_mod_table[i][0] = round(cbrt(input_value),
                                     self.precision.get_sollya_object(), RN)

    # extract the index_size most significant mantissa bits as table index
    vx_int = TypeCast(reduced_vx, precision=int_precision)
    mask = BitLogicRightShift(vx_int,
                              self.precision.get_precision() - index_size,
                              precision=int_precision)
    mask = BitLogicAnd(mask,
                       Constant(2**index_size - 1, precision=int_precision),
                       precision=int_precision,
                       tag="table_index",
                       debug=debug_multi)
    table_index = mask

    exp_vx = ExponentExtraction(vx, precision=int_precision, tag="exp_vx")
    exp_vx_third = Division(exp_vx, Constant(3, precision=int_precision),
                            precision=int_precision, tag="exp_vx_third")
    exp_vx_mod = Modulo(exp_vx, Constant(3, precision=int_precision),
                        precision=int_precision, tag="exp_vx_mod",
                        debug=debug_multi)
    # offset on modulo to make sure table index is positive
    exp_vx_mod = exp_vx_mod + 3

    cbrt_mod = TableLoad(cbrt_mod_table, exp_vx_mod, Constant(0),
                         tag="cbrt_mod")

    # initial approximation: cbrt(m) * cbrt(2^(e%3)) * 2^(e/3)
    init_approx = Multiplication(
        Multiplication(
            # approx cbrt(mantissa)
            TableLoad(cbrt_approx_table, table_index,
                      Constant(0, precision=ML_Int32), tag="seed",
                      debug=debug_multi),
            # approx cbrt(2^(e%3))
            cbrt_mod,
            tag="init_mult",
            debug=debug_multi,
            precision=self.precision),
        # 2^(e/3)
        ExponentInsertion(exp_vx_third, precision=self.precision,
                          tag="exp_vx_third", debug=debug_multi),
        tag="init_approx",
        debug=debug_multi,
        precision=self.precision)

    # reciprocal of the full input, consumed by every Newton iteration
    inverse_vx = Division(Constant(1, precision=self.precision), vx)

    current_approx = init_approx
    for i in range(num_iteration):
        current_approx = cbrt_newton_iteration(current_approx, vx, inverse_vx)
        current_approx.set_attributes(tag="approx_%d" % i, debug=debug_multi)

    result = current_approx
    result.set_attributes(tag="result", debug=debug_multi)

    # main scheme
    Log.report(Log.Info, "\033[33;1m MDL scheme \033[0m")
    scheme = Statement(Return(result))
    return scheme
def generate_scheme(self):
    """Build the evaluation scheme approximating log2(x).

    A seed-based argument reduction (DivisionSeed) plus a tabulated 2-limb
    log2 of the seed values and a polynomial correction form the generic
    path (compute_log); dedicated paths handle subnormal inputs (rescaled
    by 2^100), x == 1, exponent == -1 (near-one cancellation), and the
    usual special values (NaN, inf, zero, negative).
    """
    vx = self.implementation.add_input_variable("x", self.get_input_precision())
    sollya_precision = self.get_input_precision().get_sollya_object()

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    # testing special value inputs
    # NOTE(review): test_nan_or_inf, test_nan, test_positive,
    # test_signaling_nan and return_snan appear unused below (the scheme
    # uses the vx_* predicates defined later) — confirm before removing
    test_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False, debug=True, tag="nan_or_inf")
    test_nan = Test(vx, specifier=Test.IsNaN, debug=True, tag="is_nan_test")
    test_positive = Comparison(vx, 0, specifier=Comparison.GreaterOrEqual, debug=True, tag="inf_sign")
    test_signaling_nan = Test(vx, specifier=Test.IsSignalingNaN, debug=True, tag="is_signaling_nan")
    # if input is a signaling NaN, raise an invalid exception and returns
    # a quiet NaN
    return_snan = Statement(
        ExpRaiseReturn(ML_FPE_Invalid, return_value=FP_QNaN(self.precision)))

    vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debugd)

    int_precision = self.precision.get_integer_format()

    # log2(vx)
    # r = vx_mant
    # e = vx_exp
    # vx reduced to r in [1, 2[
    # log2(vx) = log2(r * 2^e)
    #          = log2(r) + e
    #
    ## log2(r) is approximated by
    #  log2(r) = log2(inv_seed(r) * r / inv_seed(r)
    #          = log2(inv_seed(r) * r) - log2(inv_seed(r))
    # inv_seed(r) in ]1/2, 1] => log2(inv_seed(r)) in ]-1, 0]
    #
    # inv_seed(r) * r ~ 1
    # we can easily tabulate -log2(inv_seed(r))
    #

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision=self.precision)
    dummy_div_seed = DivisionSeed(dummy_var, precision=self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_div_seed, language=None,
        table_getter=lambda self: self.approx_table_map)
    # table creation
    table_index_size = 7
    log_table = ML_NewTable(dimensions=[2**table_index_size, 2],
                            storage_precision=self.precision,
                            tag=self.uniquify_name("inv_table"))
    # value for index 0 is set to 0.0
    log_table[0][0] = 0.0
    log_table[0][1] = 0.0
    for i in range(1, 2**table_index_size):
        #inv_value = (1.0 + (self.processor.inv_approx_table[i] / S2**9) + S2**-52) * S2**-1
        #inv_value = (1.0 + (inv_approx_table[i][0] / S2**9) ) * S2**-1
        #print inv_approx_table[i][0], inv_value
        inv_value = inv_approx_table[i][0]
        # hi limb trimmed so that exponent * hi products stay exact
        value_high_bitsize = self.precision.get_field_size() - (
            self.precision.get_exponent_size() + 1)
        value_high = round(log2(inv_value), value_high_bitsize, sollya.RN)
        value_low = round(
            log2(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low

    def compute_log(_vx, exp_corr_factor=None):
        """Generic log2 path: returns (_result, _poly, _log_inv_lo,
        _log_inv_hi, _red_vx); exp_corr_factor, when given, is added to
        the extracted exponent (used for the rescaled subnormal path)."""
        _vx_mant = MantissaExtraction(_vx,
                                      tag="_vx_mant",
                                      precision=self.precision,
                                      debug=debug_lftolx)
        _vx_exp = ExponentExtraction(_vx, tag="_vx_exp", debug=debugd)

        # The main table is indexed by the 7 most significant bits
        # of the mantissa
        table_index = inv_approx_table.index_function(_vx_mant)
        table_index.set_attributes(tag="table_index", debug=debuglld)

        # argument reduction
        # Using AND -2 to exclude LSB set to 1 for Newton-Raphson convergence
        # TODO: detect if single operand inverse seed is supported by the targeted architecture
        pre_arg_red_index = TypeCast(BitLogicAnd(
            TypeCast(DivisionSeed(_vx_mant,
                                  precision=self.precision,
                                  tag="seed",
                                  debug=debug_lftolx,
                                  silent=True),
                     precision=ML_UInt64),
            Constant(-2, precision=ML_UInt64),
            precision=ML_UInt64),
            precision=self.precision,
            tag="pre_arg_red_index",
            debug=debug_lftolx)

        # index 0 maps to seed 1.0 (log_table[0] == 0.0)
        arg_red_index = Select(Equal(table_index, 0), 1.0,
                               pre_arg_red_index,
                               tag="arg_red_index",
                               debug=debug_lftolx)

        _red_vx = FMA(arg_red_index, _vx_mant, -1.0)
        _red_vx.set_attributes(tag="_red_vx", debug=debug_lftolx)

        inv_err = S2**-inv_approx_table.index_size
        # NOTE(review): red_interval appears unused — confirm
        red_interval = Interval(1 - inv_err, 1 + inv_err)

        # return in case of standard (non-special) input
        _log_inv_lo = TableLoad(log_table, table_index, 1,
                                tag="log_inv_lo", debug=debug_lftolx)
        _log_inv_hi = TableLoad(log_table, table_index, 0,
                                tag="log_inv_hi", debug=debug_lftolx)

        Log.report(Log.Verbose, "building mathematical polynomial")
        approx_interval = Interval(-inv_err, inv_err)
        poly_degree = sup(
            guessdegree(
                log2(1 + sollya.x) / sollya.x, approx_interval, S2**
                -(self.precision.get_field_size() * 1.1))) + 1
        sollya.settings.display = sollya.hexadecimal
        global_poly_object, approx_error = Polynomial.build_from_approximation_with_error(
            log2(1 + sollya.x) / sollya.x,
            poly_degree, [self.precision] * (poly_degree + 1),
            approx_interval,
            sollya.absolute,
            error_function=lambda p, f, ai, mod, t: sollya.dirtyinfnorm(
                p - f, ai))
        Log.report(
            Log.Info, "poly_degree={}, approx_error={}".format(
                poly_degree, approx_error))
        poly_object = global_poly_object.sub_poly(start_index=1, offset=1)
        #poly_object = global_poly_object.sub_poly(start_index=0,offset=0)
        Attributes.set_default_silent(True)
        Attributes.set_default_rounding_mode(ML_RoundToNearest)

        Log.report(Log.Verbose, "generating polynomial evaluation scheme")
        pre_poly = PolynomialSchemeEvaluator.generate_horner_scheme(
            poly_object, _red_vx, unified_precision=self.precision)

        # re-inject the constant coefficient excluded by sub_poly
        _poly = FMA(pre_poly, _red_vx,
                    global_poly_object.get_cst_coeff(0, self.precision))
        _poly.set_attributes(tag="poly", debug=debug_lftolx)
        Log.report(
            Log.Verbose, "sollya global_poly_object: {}".format(
                global_poly_object.get_sollya_object()))
        Log.report(
            Log.Verbose, "sollya poly_object: {}".format(
                poly_object.get_sollya_object()))

        # NOTE(review): `== None` — idiomatic form would be `is None`
        corr_exp = _vx_exp if exp_corr_factor == None else _vx_exp + exp_corr_factor

        # restore default attributes changed above
        Attributes.unset_default_rounding_mode()
        Attributes.unset_default_silent()

        pre_result = -_log_inv_hi + (_red_vx * _poly + (-_log_inv_lo))
        pre_result.set_attributes(tag="pre_result", debug=debug_lftolx)
        exact_log2_hi_exp = Conversion(corr_exp, precision=self.precision)
        exact_log2_hi_exp.set_attributes(tag="exact_log2_hi_hex",
                                         debug=debug_lftolx)
        _result = exact_log2_hi_exp + pre_result
        return _result, _poly, _log_inv_lo, _log_inv_hi, _red_vx

    result, poly, log_inv_lo, log_inv_hi, red_vx = compute_log(vx)
    result.set_attributes(tag="result", debug=debug_lftolx)

    # specific input value predicate
    neg_input = Comparison(vx, 0, likely=False,
                           specifier=Comparison.Less,
                           debug=debugd, tag="neg_input")
    vx_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                         debug=debugd, tag="nan_or_inf")
    vx_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False,
                   debug=debugd, tag="vx_snan")
    vx_inf = Test(vx, specifier=Test.IsInfty, likely=False,
                  debug=debugd, tag="vx_inf")
    vx_subnormal = Test(vx, specifier=Test.IsSubnormal, likely=False,
                        debug=debugd, tag="vx_subnormal")
    vx_zero = Test(vx, specifier=Test.IsZero, likely=False,
                   debug=debugd, tag="vx_zero")

    exp_mone = Equal(vx_exp, -1, tag="exp_minus_one",
                     debug=debugd, likely=False)
    vx_one = Equal(vx, 1.0, tag="vx_one", likely=False, debug=debugd)

    # Specific specific for the case exp == -1
    # log2(x) = log2(m) - 1
    #
    # as m in [1, 2[, log2(m) in [0, 1[
    # if r is close to 2, a catastrophic cancellation can occur
    #
    # r = seed(m)
    # log2(x) = log2(seed(m) * m / seed(m)) - 1
    #         = log2(seed(m) * m) - log2(seed(m)) - 1
    #
    # for m really close to 2 => seed(m) = 0.5
    #     => log2(x) = log2(0.5 * m)
    #                =
    result_exp_m1 = (-log_inv_hi - 1.0) + FMA(poly, red_vx, -log_inv_lo)
    result_exp_m1.set_attributes(tag="result_exp_m1", debug=debug_lftolx)

    # subnormal inputs: rescale by 2^100 then correct the exponent by -100
    m100 = -100
    S2100 = Constant(S2**100, precision=self.precision)
    result_subnormal, _, _, _, _ = compute_log(vx * S2100,
                                               exp_corr_factor=m100)
    result_subnormal.set_attributes(tag="result_subnormal",
                                    debug=debug_lftolx)

    # near-one path: direct polynomial in (vx - 1) to avoid cancellation
    one_err = S2**-7
    approx_interval_one = Interval(-one_err, one_err)
    red_vx_one = vx - 1.0
    # NOTE(review): bare `x` and bare `absolute` below — presumably
    # imported from sollya at module level; verify against file imports
    poly_degree_one = sup(
        guessdegree(
            log(1 + x) / x, approx_interval_one, S2**
            -(self.precision.get_field_size() + 1))) + 1
    poly_object_one = Polynomial.build_from_approximation(
        log(1 + sollya.x) / sollya.x, poly_degree_one,
        [self.precision] * (poly_degree_one + 1), approx_interval_one,
        absolute).sub_poly(start_index=1)
    poly_one = PolynomialSchemeEvaluator.generate_horner_scheme(
        poly_object_one, red_vx_one, unified_precision=self.precision)
    poly_one.set_attributes(tag="poly_one", debug=debug_lftolx)
    result_one = red_vx_one + red_vx_one * poly_one
    cond_one = (vx < (1 + one_err)) & (vx > (1 - one_err))
    # NOTE(review): cond_one / result_one appear unused by the final
    # scheme (exp_mone path is used instead) — confirm
    cond_one.set_attributes(tag="cond_one", debug=debugd, likely=False)

    # main scheme
    pre_scheme = ConditionBlock(
        neg_input,
        Statement(ClearException(), Raise(ML_FPE_Invalid),
                  Return(FP_QNaN(self.precision))),
        ConditionBlock(
            vx_nan_or_inf,
            ConditionBlock(
                vx_inf,
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                Statement(ClearException(),
                          ConditionBlock(vx_snan, Raise(ML_FPE_Invalid)),
                          Return(FP_QNaN(self.precision)))),
            ConditionBlock(
                vx_subnormal,
                ConditionBlock(
                    vx_zero,
                    Statement(
                        ClearException(),
                        Raise(ML_FPE_DivideByZero),
                        Return(FP_MinusInfty(self.precision)),
                    ),
                    Statement(ClearException(), result_subnormal,
                              Return(result_subnormal))),
                ConditionBlock(
                    vx_one,
                    Statement(
                        ClearException(),
                        Return(FP_PlusZero(self.precision)),
                    ),
                    ConditionBlock(exp_mone, Return(result_exp_m1),
                                   Return(result))))))
    scheme = Statement(result, pre_scheme)
    return scheme
def generate_scheme(self):
    """Build the metalibm operation graph (scheme) evaluating log1p(x).

    The evaluation is split into three numerical paths plus special cases:
      - |x| small (input exponent < -7): direct polynomial evaluation of
        x * P(x), where P approximates log1p(x)/x on the small interval;
      - generic path: table-driven argument reduction on (x + 1) using a
        reciprocal seed, followed by a Horner polynomial for log(1+r)/r
        and a double-word (hi/lo) reconstruction with exponent * log(2);
      - special cases: x < -1 (invalid -> qNaN), +/-inf and NaN inputs,
        and subnormal x (where log1p(x) ~ x).

    Returns:
        the root ConditionBlock of the generated scheme.

    Fix vs previous revision: a duplicated, byte-identical re-computation of
    ``vx_exp`` (a second ExponentExtraction node) has been removed; the
    single node is used by the close-to-zero predicate below.
    """
    vx = self.implementation.add_input_variable("x", self.precision)
    sollya_precision = self.get_input_precision().sollya_object

    # local overloading of RaiseReturn operation
    # NOTE(review): not referenced in this scheme -- looks like a leftover
    # from sibling meta-functions; confirm before removing.
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        return RaiseReturn(*args, **kwords)

    # double-word constant for log(2): the hi part is rounded to fewer bits
    # so products with small integer exponents stay exact; the lo part
    # carries the rounding remainder
    log2_hi_value = round(
        log(2),
        self.precision.get_field_size() - (self.precision.get_exponent_size() + 1),
        sollya.RN)
    log2_lo_value = round(
        log(2) - log2_hi_value, self.precision.sollya_object, sollya.RN)

    log2_hi = Constant(log2_hi_value, precision=self.precision)
    log2_lo = Constant(log2_lo_value, precision=self.precision)

    vx_exp = ExponentExtraction(vx, tag="vx_exp", debug=debugd)

    int_precision = self.precision.get_integer_format()

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision=self.precision)
    dummy_div_seed = ReciprocalSeed(dummy_var, precision=self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_div_seed, language=None,
        table_getter=lambda self: self.approx_table_map)

    # table creation: entry i holds log(seed_i) as a (hi, lo) double-word;
    # entry 0 is reserved for the "no reduction" case (log(1) == 0)
    table_index_size = 7
    log_table = ML_NewTable(
        dimensions=[2**table_index_size, 2],
        storage_precision=self.precision)
    log_table[0][0] = 0.0
    log_table[0][1] = 0.0
    for i in range(1, 2**table_index_size):
        #inv_value = (1.0 + (self.processor.inv_approx_table[i] / S2**9) + S2**-52) * S2**-1
        inv_value = inv_approx_table[i] # (1.0 + (inv_approx_table[i] / S2**9) ) * S2**-1
        value_high = round(
            log(inv_value),
            self.precision.get_field_size() - (self.precision.get_exponent_size() + 1),
            sollya.RN)
        value_low = round(log(inv_value) - value_high, sollya_precision, sollya.RN)
        log_table[i][0] = value_high
        log_table[i][1] = value_low

    # case close to 0 ("ctz"): for |x| < 2^-7, log1p(x) = x * P(x)
    # with P approximating log1p(x)/x on the centered interval
    ctz_exp_limit = -7
    ctz_cond = vx_exp < ctz_exp_limit
    ctz_interval = Interval(-S2**ctz_exp_limit, S2**ctz_exp_limit)

    ctz_poly_degree = sup(guessdegree(
        log1p(sollya.x) / sollya.x, ctz_interval,
        S2**-(self.precision.get_field_size() + 1))) + 1
    ctz_poly_object = Polynomial.build_from_approximation(
        log1p(sollya.x) / sollya.x, ctz_poly_degree,
        [self.precision] * (ctz_poly_degree + 1),
        ctz_interval, sollya.absolute)

    Log.report(Log.Info, "generating polynomial evaluation scheme")
    ctz_poly = PolynomialSchemeEvaluator.generate_horner_scheme(
        ctz_poly_object, vx, unified_precision=self.precision)
    ctz_poly.set_attributes(tag="ctz_poly", debug=debug_lftolx)

    ctz_result = vx * ctz_poly

    # special-input predicates
    neg_input = Comparison(vx, -1, likely=False, specifier=Comparison.Less,
                           debug=debugd, tag="neg_input")
    vx_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                         debug=debugd, tag="nan_or_inf")
    vx_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False,
                   debug=debugd, tag="snan")
    vx_inf = Test(vx, specifier=Test.IsInfty, likely=False,
                  debug=debugd, tag="inf")
    vx_subnormal = Test(vx, specifier=Test.IsSubnormal, likely=False,
                        debug=debugd, tag="vx_subnormal")

    # NOTE(review): the following "new_log" function objects are constructed
    # but never used in the emitted scheme; confirm before removing.
    log_function_code = CodeFunction(
        "new_log", [Variable("x", precision=ML_Binary64)],
        output_format=ML_Binary64)
    log_call_generator = FunctionOperator(
        log_function_code.get_name(), arity=1,
        output_precision=ML_Binary64, declare_prototype=log_function_code)
    newlog_function = FunctionObject(
        log_function_code.get_name(), (ML_Binary64,), ML_Binary64,
        log_call_generator)

    # case away from 0.0
    pre_vxp1 = vx + 1.0
    pre_vxp1.set_attributes(tag="pre_vxp1", debug=debug_lftolx)
    pre_vxp1_exp = ExponentExtraction(pre_vxp1, tag="pre_vxp1_exp", debug=debugd)
    cm500 = Constant(-500, precision=ML_Int32)
    c0 = Constant(0, precision=ML_Int32)
    # very large (x + 1) exponents are pre-scaled by 2^-500 so the
    # reciprocal seed / reduction below stays in range; the scaling
    # exponent is folded back through corr_exp further down
    cond_scaling = pre_vxp1_exp > 2**(self.precision.get_exponent_size() - 2)
    scaling_factor_exp = Select(cond_scaling, cm500, c0)
    scaling_factor = ExponentInsertion(
        scaling_factor_exp, precision=self.precision, tag="scaling_factor")

    vxp1 = pre_vxp1 * scaling_factor
    vxp1.set_attributes(tag="vxp1", debug=debug_lftolx)
    vxp1_exp = ExponentExtraction(vxp1, tag="vxp1_exp", debug=debugd)

    vxp1_inv = ReciprocalSeed(vxp1, precision=self.precision,
                              tag="vxp1_inv", debug=debug_lftolx, silent=True)
    # fallback "inverse": plain 2^-exponent, used when the table index is 0
    vxp1_dirty_inv = ExponentInsertion(-vxp1_exp, precision=self.precision,
                                       tag="vxp1_dirty_inv", debug=debug_lftolx)

    # table index = 7 leading mantissa bits of (x + 1)
    table_index = BitLogicAnd(
        BitLogicRightShift(
            TypeCast(vxp1, precision=int_precision, debug=debuglx),
            self.precision.get_field_size() - 7, debug=debuglx),
        0x7f, tag="table_index", debug=debuglx)

    # argument reduction
    # TODO: detect if single operand inverse seed is supported by the targeted architecture
    # clearing the seed's last mantissa bit so the reduction multiply is exact
    pre_arg_red_index = TypeCast(
        BitLogicAnd(
            TypeCast(vxp1_inv, precision=ML_UInt64),
            Constant(-2, precision=ML_UInt64),
            precision=ML_UInt64),
        precision=self.precision, tag="pre_arg_red_index", debug=debug_lftolx)
    arg_red_index = Select(Equal(table_index, 0), vxp1_dirty_inv,
                           pre_arg_red_index,
                           tag="arg_red_index", debug=debug_lftolx)
    # r = seed * (x + 1) - 1, expanded as seed*x - 1 + seed on the
    # non-scaled path to avoid the rounding error of computing x + 1
    red_vxp1 = Select(cond_scaling,
                      arg_red_index * vxp1 - 1.0,
                      (arg_red_index * vx - 1.0) + arg_red_index)
    #red_vxp1 = arg_red_index * vxp1 - 1.0
    red_vxp1.set_attributes(tag="red_vxp1", debug=debug_lftolx)

    log_inv_lo = TableLoad(log_table, table_index, 1,
                           tag="log_inv_lo", debug=debug_lftolx)
    log_inv_hi = TableLoad(log_table, table_index, 0,
                           tag="log_inv_hi", debug=debug_lftolx)

    inv_err = S2**-6 # TODO: link to target DivisionSeed precision

    Log.report(Log.Info, "building mathematical polynomial")
    approx_interval = Interval(-inv_err, inv_err)
    poly_degree = sup(guessdegree(
        log(1 + sollya.x) / sollya.x, approx_interval,
        S2**-(self.precision.get_field_size() + 1))) + 1
    global_poly_object = Polynomial.build_from_approximation(
        log(1 + sollya.x) / sollya.x, poly_degree,
        [self.precision] * (poly_degree + 1),
        approx_interval, sollya.absolute)
    # drop the constant term: it is reconstructed exactly by red_vxp1 itself
    poly_object = global_poly_object.sub_poly(start_index=1)

    Log.report(Log.Info, "generating polynomial evaluation scheme")
    _poly = PolynomialSchemeEvaluator.generate_horner_scheme(
        poly_object, red_vxp1, unified_precision=self.precision)
    _poly.set_attributes(tag="poly", debug=debug_lftolx)
    Log.report(Log.Info, global_poly_object.get_sollya_object())

    # NOTE(review): vxp1_inv_exp is computed but not consumed by the scheme
    vxp1_inv_exp = ExponentExtraction(vxp1_inv, tag="vxp1_inv_exp", debug=debugd)
    # effective exponent correction, undoing both the reduction and the
    # optional 2^-500 pre-scaling
    corr_exp = Conversion(-vxp1_exp + scaling_factor_exp,
                          precision=self.precision)  # vxp1_inv_exp

    #poly = (red_vxp1) * (1 + _poly)
    #poly.set_attributes(tag = "poly", debug = debug_lftolx, prevent_optimization = True)

    pre_result = -log_inv_hi + (red_vxp1 + red_vxp1 * _poly
                                + (-corr_exp * log2_lo - log_inv_lo))
    pre_result.set_attributes(tag="pre_result", debug=debug_lftolx)
    exact_log2_hi_exp = - corr_exp * log2_hi
    exact_log2_hi_exp.set_attributes(tag="exact_log2_hi_exp",
                                     debug=debug_lftolx,
                                     prevent_optimization=True)
    #std_result = exact_log2_hi_exp + pre_result

    exact_log2_lo_exp = - corr_exp * log2_lo
    exact_log2_lo_exp.set_attributes(tag="exact_log2_lo_exp",
                                     debug=debug_lftolx)  #, prevent_optimization = True)

    # accurate reconstruction, summed from smallest to largest magnitude:
    # lo exponent term, table lo, poly, reduced argument, table hi, hi exponent
    init = exact_log2_lo_exp - log_inv_lo
    init.set_attributes(tag="init", debug=debug_lftolx, prevent_optimization=True)
    fma0 = (red_vxp1 * _poly + init) # - log_inv_lo)
    fma0.set_attributes(tag="fma0", debug=debug_lftolx)
    step0 = fma0
    step0.set_attributes(tag="step0", debug=debug_lftolx)  #, prevent_optimization = True)
    step1 = step0 + red_vxp1
    step1.set_attributes(tag="step1", debug=debug_lftolx, prevent_optimization=True)
    step2 = -log_inv_hi + step1
    step2.set_attributes(tag="step2", debug=debug_lftolx, prevent_optimization=True)
    std_result = exact_log2_hi_exp + step2
    std_result.set_attributes(tag="std_result", debug=debug_lftolx,
                              prevent_optimization=True)

    # main scheme
    Log.report(Log.Info, "MDL scheme")
    pre_scheme = ConditionBlock(
        neg_input,
        # x < -1: domain error -> raise invalid, return qNaN
        Statement(
            ClearException(),
            Raise(ML_FPE_Invalid),
            Return(FP_QNaN(self.precision))
        ),
        ConditionBlock(
            vx_nan_or_inf,
            ConditionBlock(
                vx_inf,
                # log1p(+inf) = +inf
                Statement(
                    ClearException(),
                    Return(FP_PlusInfty(self.precision)),
                ),
                # NaN input: invalid flag only for signaling NaN, return qNaN
                Statement(
                    ClearException(),
                    ConditionBlock(vx_snan,
                        Raise(ML_FPE_Invalid)
                    ),
                    Return(FP_QNaN(self.precision))
                )
            ),
            ConditionBlock(
                vx_subnormal,
                # subnormal x: log1p(x) rounds to x
                Return(vx),
                ConditionBlock(
                    ctz_cond,
                    Statement(
                        Return(ctz_result),
                    ),
                    Statement(
                        Return(std_result)
                    )
                )
            )
        )
    )
    scheme = pre_scheme
    return scheme