def test_ref_assign(self):
    """ test behavior of StaticVectorizer on predicated ReferenceAssign """
    va = Variable("a")
    vb = Variable("b")
    vc = Variable("c")
    scheme = Statement(
        ReferenceAssign(va, Constant(3)),
        ConditionBlock(
            (va > vb).modify_attributes(likely=True),
            Statement(
                ReferenceAssign(vb, va),
                ReferenceAssign(va, Constant(11)),
                Return(va)
            ),
        ),
        ReferenceAssign(va, Constant(7)),
        Return(vb))
    vectorized_path = StaticVectorizer().extract_vectorizable_path(
        scheme, fallback_policy)
    linearized_most_likely_path = instanciate_variable(
        vectorized_path.linearized_optree, vectorized_path.variable_mapping)
    test_result = (isinstance(linearized_most_likely_path, Constant)
                   and linearized_most_likely_path.get_value() == 11)
    if not test_result:
        print("test UT_StaticVectorizer failure")
        print("scheme: {}".format(scheme.get_str()))
        print("linearized_most_likely_path: {}".format(linearized_most_likely_path))
    self.assertTrue(test_result)
def generate_scheme(self):
    # declaring function input variables
    vx = self.implementation.add_input_variable("x", self.precision)
    vy = self.implementation.add_input_variable("y", self.precision)

    Cst0 = Constant(5, precision=self.precision)
    Cst1 = Constant(7, precision=self.precision)
    comp = Comparison(vx, vy, specifier=Comparison.Greater,
                      precision=ML_Bool, tag="comp")
    comp_eq = Comparison(vx, vy, specifier=Comparison.Equal,
                         precision=ML_Bool, tag="comp_eq")

    scheme = Statement(
        ConditionBlock(
            comp,
            Return(vy, precision=self.precision),
            ConditionBlock(
                comp_eq,
                Return(vx + vy * Cst0 - Cst1, precision=self.precision))),
        ConditionBlock(comp_eq, Return(Cst1 * vy, precision=self.precision)),
        Return(vx * vy, precision=self.precision))
    return scheme
def generate_scheme(self):
    # declaring input variable
    vx = self.implementation.add_input_variable("x", self.precision)
    vx2 = vx * vx
    scheme = ConditionBlock(
        vx > 0,
        Return(vx - 0.33 * vx2 * vx + (2 / 15.0) * vx * vx2 * vx2),
        Return(FP_QNaN(self.precision)))
    return scheme
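# The polynomial above matches the first terms of the Taylor expansion of
# tanh, x - x^3/3 + 2*x^5/15, with 1/3 loosely rounded to 0.33. A minimal
# standalone sanity check of that degree-5 form against math.tanh (an
# illustration only, not part of the generated scheme):
import math

def tanh_poly_approx(x):
    """ degree-5 Taylor approximation of tanh around 0 """
    x2 = x * x
    return x - x2 * x / 3.0 + (2.0 / 15.0) * x * x2 * x2

for x in (0.01, 0.1, 0.25):
    # the truncation error behaves in O(x^7), so it shrinks quickly near 0
    assert abs(tanh_poly_approx(x) - math.tanh(x)) < abs(x)**7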
def generate_scheme(self):
    vx = self.implementation.add_input_variable("x", FIXED_FORMAT)
    # declaring specific interval for input variable <x>
    vx.set_interval(Interval(-1, 1))

    acc_format = ML_Custom_FixedPoint_Format(6, 58, False)
    c = Constant(2, precision=acc_format, tag="C2")

    ivx = vx
    add_ivx = Addition(
        c,
        Multiplication(ivx, ivx, precision=acc_format, tag="mul"),
        precision=acc_format,
        tag="add"
    )
    result = add_ivx

    input_mapping = {ivx: ivx.get_precision().round_sollya_object(0.125)}
    error_eval_map = runtime_error_eval.generate_error_eval_graph(result, input_mapping)

    # dummy scheme to make functional code generation
    scheme = Statement()
    for node in error_eval_map:
        scheme.add(error_eval_map[node])
    scheme.add(Return(result))
    return scheme
def externalize_call(self, optree, arg_list, tag="foo", result_format=None):
    # determining return format
    return_format = optree.get_precision() if result_format is None else result_format
    assert return_format is not None, "external call result format must be defined"
    # function_name = self.main_code_object.declare_free_function_name(tag)
    function_name = self.name_factory.declare_free_function_name(tag)
    ext_function = CodeFunction(function_name, output_format=return_format)

    # creating argument copy
    arg_map = {}
    arg_index = 0
    for arg in arg_list:
        arg_tag = arg.get_tag(default="arg_%d" % arg_index)
        arg_index += 1
        arg_map[arg] = ext_function.add_input_variable(arg_tag, arg.get_precision())

    # copying optree while swapping arguments for variables
    optree_copy = optree.copy(copy_map=arg_map)
    # instantiating external function scheme
    if isinstance(optree, ML_ArithmeticOperation):
        function_optree = Statement(Return(optree_copy))
    else:
        function_optree = Statement(optree_copy)
    ext_function.set_scheme(function_optree)
    self.name_factory.declare_function(function_name, ext_function.get_function_object())
    return ext_function
def ExpRaiseReturn(*args, **kwords):
    kwords["arg_value"] = vx
    kwords["function_name"] = self.function_name
    if self.libm_compliant:
        return RaiseReturn(*args, precision=self.precision, **kwords)
    else:
        return Return(kwords["return_value"], precision=self.precision)
def generate_scheme(self):
    self.var_mapping = {}
    for var_index in range(self.arity):
        # FIXME: maximal arity is 4
        var_tag = ["x", "y", "z", "t"][var_index]
        self.var_mapping[var_tag] = self.implementation.add_input_variable(
            var_tag, self.get_input_precision(var_index),
            interval=self.input_intervals[var_index])

    self.function_expr = function_parser(self.function_expr_str, self.var_mapping)

    Log.report(Log.Info, "evaluating function range")
    evaluate_range(self.function_expr, update_interval=True)
    Log.report(
        LOG_VERBOSE_FUNCTION_EXPR, "scheme is: \n{}",
        self.function_expr.get_str(depth=None, display_interval=True))

    # defining a copy map to avoid copying input Variables
    # (self.var_mapping maps tags to Variable nodes, so iterate over values)
    copy_map = dict((var, var) for var in self.var_mapping.values())

    function_expr_copy = self.function_expr.copy(copy_map)

    result, scheme = self.instanciate_graph(function_expr_copy,
                                            expand_div=self.expand_div)
    scheme.add(Return(result, precision=self.precision))
    return scheme
def generate_test_wrapper(self, tensor_descriptors, input_tables, output_tables):
    auto_test = CodeFunction("test_wrapper", output_format=ML_Int32)

    tested_function = self.implementation.get_function_object()
    function_name = self.implementation.get_name()

    failure_report_op = FunctionOperator("report_failure")
    failure_report_function = FunctionObject("report_failure", [], ML_Void,
                                             failure_report_op)

    printf_success_op = FunctionOperator(
        "printf",
        arg_map={0: "\"test successful %s\\n\"" % function_name},
        void_function=True,
        require_header=["stdio.h"])
    printf_success_function = FunctionObject("printf", [], ML_Void,
                                             printf_success_op)

    # accumulate element number
    acc_num = Variable("acc_num", precision=ML_Int64, var_type=Variable.Local)

    test_loop = self.get_tensor_test_wrapper(
        tested_function, tensor_descriptors, input_tables, output_tables,
        acc_num, self.generate_tensor_check_loop)

    # common test scheme between scalar and vector functions
    test_scheme = Statement(test_loop, printf_success_function(),
                            Return(Constant(0, precision=ML_Int32)))
    auto_test.set_scheme(test_scheme)
    return FunctionGroup([auto_test])
def generate_scheme(self):
    # declaring function input variables
    vx = self.implementation.add_input_variable("x", self.precision)
    vy = self.implementation.add_input_variable("y", self.precision)
    scheme = Return(vx + vy)
    return scheme
def generate_scheme(self):
    size_format = ML_Int32

    # Matrix storage
    in_storage = self.implementation.add_input_variable(
        "buffer_in", ML_Pointer_Format(self.precision))
    kernel_storage = self.implementation.add_input_variable(
        "buffer_kernel", ML_Pointer_Format(self.precision))
    out_storage = self.implementation.add_input_variable(
        "buffer_out", ML_Pointer_Format(self.precision))

    # Matrix sizes
    w = self.implementation.add_input_variable("w", size_format)
    h = self.implementation.add_input_variable("h", size_format)

    # input is a (w x h) matrix in row-major
    tIn = Tensor(in_storage,
                 TensorDescriptor([w, h], [1, w], self.precision))
    # kernel tensor, with strides accumulated from its dimensions
    kernel_strides = [1]
    for previous_dim in self.kernel_size[:-1]:
        kernel_strides.append(previous_dim * kernel_strides[-1])
    print("kernel_strides: {}".format(kernel_strides))
    tKernel = Tensor(
        kernel_storage,
        TensorDescriptor(self.kernel_size, kernel_strides, self.precision))
    # output is a (w x h) matrix in row-major
    tOut = Tensor(out_storage,
                  TensorDescriptor([w, h], [1, w], self.precision))

    index_format = ML_Int32

    # main NDRange description
    i = Variable("i", precision=index_format, var_type=Variable.Local)
    j = Variable("j", precision=index_format, var_type=Variable.Local)
    k_w = Variable("k_w", precision=index_format, var_type=Variable.Local)
    k_h = Variable("k_h", precision=index_format, var_type=Variable.Local)
    result = NDRange(
        [IterRange(i, 0, w - 1), IterRange(j, 0, h - 1)],
        WriteAccessor(
            tOut, [i, j],
            Sum(
                Sum(
                    Multiplication(
                        ReadAccessor(tIn, [i + k_w, j - k_h], self.precision),
                        ReadAccessor(tKernel, [k_w, k_h], self.precision)),
                    IterRange(k_w, -(self.kernel_size[0] - 1) // 2,
                              (self.kernel_size[0] - 1) // 2),
                    precision=self.precision),
                IterRange(k_h, -(self.kernel_size[1] - 1) // 2,
                          (self.kernel_size[1] - 1) // 2),
                precision=self.precision)))

    mdl_scheme = expand_ndrange(result)
    print("mdl_scheme:\n{}".format(mdl_scheme.get_str(depth=None)))
    return Statement(mdl_scheme, Return())
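# Plain-Python reference of the 2D kernel sweep the NDRange above expresses:
# for each output pixel (i, j), accumulate in[i + kw][j - kh] * kernel[kw][kh]
# over the centered kernel window. The bounds guard is an added assumption for
# this demo (the raw index arithmetic above has no edge handling); names are
# illustrative only.
def conv2d_ref(img, kernel):
    w, h = len(img), len(img[0])
    kw_size, kh_size = len(kernel), len(kernel[0])
    out = [[0.0] * h for _ in range(w)]
    for i in range(w):
        for j in range(h):
            acc = 0.0
            for kw in range(-(kw_size - 1) // 2, (kw_size - 1) // 2 + 1):
                for kh in range(-(kh_size - 1) // 2, (kh_size - 1) // 2 + 1):
                    ii, jj = i + kw, j - kh
                    if 0 <= ii < w and 0 <= jj < h:  # added edge guard
                        acc += (img[ii][jj]
                                * kernel[kw + (kw_size - 1) // 2][kh + (kh_size - 1) // 2])
            out[i][j] = acc
    return out

# a 1x1 identity kernel leaves the image unchanged
assert conv2d_ref([[1.0, 2.0], [3.0, 4.0]], [[1.0]]) == [[1.0, 2.0], [3.0, 4.0]]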
def generate_scheme(self):
    # declaring function input variable
    vx = self.implementation.add_input_variable("x", self.precision)

    approx = ReciprocalSeed(vx, precision=self.precision, tag="approx")
    result = approx

    scheme = Return(result, precision=self.precision, debug=debug_multi)
    return scheme
def generate_tensor_check_loop(self, tensor_descriptors, input_tables,
                               output_tables):
    # unpack tensor descriptors tuple
    (input_tensor_descriptor_list,
     output_tensor_descriptor_list) = tensor_descriptors
    # internal array iterator index
    vj = Variable("j", precision=ML_UInt32, var_type=Variable.Local)

    printf_error_detail_function = self.get_printf_error_detail_fct(
        output_tensor_descriptor_list[0])

    NUM_INPUT_ARRAY = len(input_tables)

    # generate the expected table for the whole multi-array
    expected_tables = self.generate_expected_table(tensor_descriptors,
                                                   input_tables)

    # global statement to list all checks
    check_statement = Statement()

    # implement check for each output tensor
    for out_id, out_td in enumerate(output_tensor_descriptor_list):
        # expected values for the (vj)-th entry of the sub-array
        expected_values = [
            TableLoad(expected_tables[out_id], vj, i)
            for i in range(self.accuracy.get_num_output_value())
        ]
        # local result for the (vj)-th entry of the sub-array
        local_result = TableLoad(output_tables[out_id], vj)

        array_len = out_td.get_bounding_size()

        if self.break_error:
            return_statement_break = Statement(
                printf_error_detail_function(*((vj,) + (local_result,))),
                self.accuracy.get_output_print_call(self.function_name,
                                                    expected_values))
        else:
            return_statement_break = Statement(
                printf_error_detail_function(*((vj,) + (local_result,))),
                self.accuracy.get_output_print_call(self.function_name,
                                                    expected_values),
                Return(Constant(1, precision=ML_Int32)))

        check_array_loop = Loop(
            ReferenceAssign(vj, 0),
            vj < array_len,
            Statement(
                ConditionBlock(
                    self.accuracy.get_output_check_test(local_result,
                                                        expected_values),
                    return_statement_break),
                ReferenceAssign(vj, vj + 1),
            ))
        check_statement.add(check_array_loop)
    return check_statement
def generate_function_from_optree(name_factory, optree, arg_list, tag="foo",
                                  result_format=None):
    """ Function which transforms a sub-graph @p optree whose inputs are
        @p arg_list into a meta function.
        @param optree operation graph to be incorporated as function body
        @param arg_list list of @p optree's parameters to be used as function
               arguments
        @param name_factory engine to generate a unique function name and to
               register the function
        @param tag string to be used as seed to generate the function name
        @param result_format hint to indicate the function's return format
               (if @p optree is not an arithmetic operation, e.g. it already
               contains a Return node, then @p result_format must be used to
               specify the function return format)
        @return CodeFunction object containing the function implementation
                (the function will also have been declared into name_factory)
    """
    # determining return format
    return_format = optree.get_precision() if result_format is None else result_format
    assert return_format is not None, "external call result format must be defined"
    function_name = name_factory.declare_free_function_name(tag)
    ext_function = CodeFunction(function_name, output_format=return_format)

    # creating argument copy
    arg_map = {}
    arg_index = 0
    for arg in arg_list:
        arg_tag = arg.get_tag(default="arg_%d" % arg_index)
        arg_index += 1
        arg_map[arg] = ext_function.add_input_variable(arg_tag, arg.get_precision())

    # extracting const tables to make sure they are not duplicated
    table_set = extract_tables(optree)
    arg_map.update({table: table for table in table_set if table.const})

    # copying optree while swapping arguments for variables
    optree_copy = optree.copy(copy_map=arg_map)
    # instantiating external function scheme
    if isinstance(optree, ML_ArithmeticOperation):
        function_optree = Statement(Return(optree_copy))
    else:
        function_optree = Statement(optree_copy)
    ext_function.set_scheme(function_optree)
    name_factory.declare_function(function_name, ext_function.get_function_object())
    return ext_function
def generate_scheme(self):
    var = self.implementation.add_input_variable("x", self.precision)
    var_y = self.implementation.add_input_variable("y", self.precision)
    var_z = self.implementation.add_input_variable("z", self.precision)
    mult = Multiplication(var, var_z, precision=self.precision)
    add = Addition(var_y, mult, precision=self.precision)

    test_program = Statement(
        add,
        Return(add)
    )
    return test_program
def generate_scalar_scheme(self, vx, vy):
    div = Division(vx, vy, precision=self.precision)
    div_if = Trunc(div, precision=self.precision)
    rem = Variable("rem", var_type=Variable.Local, precision=self.precision)
    qi = Variable("qi", var_type=Variable.Local, precision=self.precision)
    qi_bound = Constant(S2**self.precision.get_mantissa_size())
    init_rem = FusedMultiplyAdd(-div_if, vy, vx)

    # factorizing 1 / vy to save time
    # NOTES: it makes rem / vy approximate
    # shared_rcp = Division(1, vy, precision=self.precision)

    iterative_fmod = Loop(
        Statement(
            ReferenceAssign(rem, init_rem),
            ReferenceAssign(qi, div_if),
        ),
        Abs(qi) > qi_bound,
        Statement(
            ReferenceAssign(
                qi,
                # Trunc(shared_rcp * rem, precision=self.precision)
                Trunc(rem / vy, precision=self.precision)),
            ReferenceAssign(rem, FMA(-qi, vy, rem))))
    scheme = Statement(
        rem,
        # shared_rcp,
        iterative_fmod,
        ConditionBlock(
            # if rem's sign and vx's sign mismatch
            (rem * vx < 0.0).modify_attributes(tag="update_cond",
                                               debug=debug_multi),
            Return(rem + vy),
            Return(rem),
        ))
    return scheme
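# A plain-Python trace of the iterative fmod loop above: a trunc-based
# quotient estimate, FMA-style remainder updates, and a final sign fix-up.
# Illustration only; Python floats are binary64, and plain -qi * y + rem is
# used in place of a true FMA (math.fma only exists from Python 3.13 on).
import math

def fmod_ref(x, y, mantissa_size=53):
    qi = math.trunc(x / y)
    rem = -qi * y + x
    qi_bound = 2.0 ** mantissa_size
    while abs(qi) > qi_bound:
        qi = math.trunc(rem / y)
        rem = -qi * y + rem
    # if rem's sign and x's sign mismatch, pull the result back
    if rem * x < 0.0:
        return rem + y
    return rem

assert fmod_ref(7.5, 2.0) == math.fmod(7.5, 2.0) == 1.5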
def generate_scheme(self):
    """ main scheme generation """
    input_precision = self.precision
    output_precision = self.precision

    # declaring main input variables
    x_interval = Interval(-10.3, 10.7)
    var_x = self.implementation.add_input_variable("x", input_precision,
                                                   interval=x_interval)
    y_interval = Interval(-17.9, 17.2)
    var_y = self.implementation.add_input_variable("y", input_precision,
                                                   interval=y_interval)
    z_interval = Interval(-70.3, -57.7)
    var_z = self.implementation.add_input_variable("z", input_precision,
                                                   interval=z_interval)

    min_yz = Min(var_z, var_y)

    cst0 = Constant(42.5, tag="cst0", precision=self.precision)
    cst1 = Constant(2.5, tag="cst1", precision=self.precision)
    cst2 = Constant(12.5, tag="cst2", precision=self.precision)

    new_cst = cst0 + cst1 * cst2

    result = min_yz + new_cst

    scheme = ConditionBlock(
        LogicalAnd(
            LogicalOr(cst0 > cst1, LogicalNot(cst1 > cst0)),
            var_x > var_y,
        ),
        Return(result),
        Return(cst2))
    return scheme
def generate_scheme(self):
    size_format = ML_Int32

    # Matrix storage
    A_storage = self.implementation.add_input_variable(
        "buffer_a", ML_Pointer_Format(self.precision))
    B_storage = self.implementation.add_input_variable(
        "buffer_b", ML_Pointer_Format(self.precision))
    C_storage = self.implementation.add_input_variable(
        "buffer_c", ML_Pointer_Format(self.precision))

    # Matrix sizes
    n = self.implementation.add_input_variable("n", size_format)
    m = self.implementation.add_input_variable("m", size_format)
    p = self.implementation.add_input_variable("p", size_format)

    # A is a (n x p) matrix in row-major
    tA = Tensor(A_storage, TensorDescriptor([p, n], [1, p], self.precision))
    # B is a (p x m) matrix in row-major
    tB = Tensor(B_storage, TensorDescriptor([m, p], [1, m], self.precision))
    # C is a (n x m) matrix in row-major
    tC = Tensor(C_storage, TensorDescriptor([m, n], [1, m], self.precision))

    index_format = ML_Int32

    i = Variable("i", precision=index_format, var_type=Variable.Local)
    j = Variable("j", precision=index_format, var_type=Variable.Local)
    k = Variable("k", precision=index_format, var_type=Variable.Local)
    result = NDRange(
        [IterRange(j, 0, m - 1), IterRange(i, 0, n - 1)],
        WriteAccessor(
            tC, [j, i],
            Sum(
                Multiplication(
                    ReadAccessor(tA, [k, i], self.precision),
                    ReadAccessor(tB, [j, k], self.precision),
                    precision=self.precision),
                IterRange(k, 0, p - 1),
                precision=self.precision)))

    # mdl_scheme = expand_ndrange(exchange_loop_order(tile_ndrange(result, {j: 2, i: 2}), [1, 0]))
    if self.vectorize:
        mdl_scheme = expand_ndrange(vectorize_ndrange(result, j, 4))
    else:
        mdl_scheme = expand_ndrange(
            exchange_loop_order(tile_ndrange(result, {j: 2, i: 2}), [1, 0]))
    print("mdl_scheme:\n{}".format(
        mdl_scheme.get_str(depth=None, display_precision=True)))
    return Statement(
        mdl_scheme,
        Return()
    )
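# Plain-Python reference of the (n x p) * (p x m) row-major matmul that the
# NDRange above expresses: C[i][j] = sum_k A[i][k] * B[k][j], with each matrix
# stored as a flat buffer indexed through its strides (illustration only).
def matmul_ref(A, B, n, m, p):
    # A: flat list of n*p values, strides [1, p] -> A[i*p + k]
    # B: flat list of p*m values, strides [1, m] -> B[k*m + j]
    C = [0.0] * (n * m)
    for i in range(n):
        for j in range(m):
            acc = 0.0
            for k in range(p):
                acc += A[i * p + k] * B[k * m + j]
            C[i * m + j] = acc
    return C

# 2x2 identity times a 2x2 matrix leaves it unchanged
assert matmul_ref([1, 0, 0, 1], [1, 2, 3, 4], 2, 2, 2) == [1, 2, 3, 4]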
def generate_scheme(self):
    # declaring function input variable
    vx = self.implementation.add_input_variable("x", self.get_input_precision(0))

    bf16_params = ML_NewTable(dimensions=[self.table_size],
                              storage_precision=BFloat16)
    for i in range(self.table_size):
        bf16_params[i] = 1.1**i

    conv_vx = Conversion(TableLoad(bf16_params, vx), precision=ML_Binary32,
                         tag="conv_vx", debug=debug_multi)

    result = conv_vx

    scheme = Return(result, precision=self.precision, debug=debug_multi)
    return scheme
def generate_scalar_scheme(self, vx):
    output_precision = self.precision
    input_precision = vx.get_precision()

    bias = -output_precision.get_bias()
    bound_exp = Max(
        Min(vx, output_precision.get_emax(), precision=input_precision),
        output_precision.get_emin_normal(), precision=input_precision) + bias
    scheme = Return(
        ExponentInsertion(bound_exp, specifier=ExponentInsertion.NoOffset,
                          precision=self.precision),
        tag="result", debug=debug_multi)
    return scheme
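# Standalone sketch of the exponent clamp above, using binary64 parameters
# (emax=1023, emin_normal=-1022) as an assumed example: the input is clamped
# into the valid exponent range and 2**e is built directly, which is what
# ExponentInsertion does; the +bias term in the scheme maps the clamped
# exponent onto the biased field encoding, while this sketch works with
# unbiased exponents via math.ldexp.
import math

def exp2_clamped(e, emax=1023, emin_normal=-1022):
    bound_exp = max(min(e, emax), emin_normal)
    return math.ldexp(1.0, int(bound_exp))

assert exp2_clamped(10) == 1024.0
assert exp2_clamped(5000) == 2.0 ** 1023   # clamped to emax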
def generate_scheme(self):
    # declare a new input parameter vx whose tag is "x" and
    # whose format is single precision
    vx = self.implementation.add_input_variable("x", self.get_input_precision(0))
    # declare a new input parameter vy whose tag is "y" and
    # whose format is single precision
    vy = self.implementation.add_input_variable("y", self.get_input_precision(1))
    # declare the main operation graph for the meta-function:
    # a single Statement containing a single return statement which computes
    # the addition of the two input variables in single precision
    main_scheme = Statement(
        Return(vx + vy, precision=ML_Binary32)
    )
    return main_scheme
def generate_scheme(self):
    """ generate an operation unitary bench test scheme (a graph of
        operations implementing latency computation on a dependent
        sequence of self.op_class) """
    unroll_factor = self.unroll_factor
    test_num = self.test_num

    bench_statement = metaop.Statement()
    # floating-point bench
    for op_class in self.operation_map:
        for output_precision in self.operation_map[op_class]:
            for predicate in OPERATOR_BENCH_MAP[op_class]:
                if predicate(op_class, output_precision, None):
                    op_bench = OPERATOR_BENCH_MAP[op_class][predicate]
                    bench_statement.add(
                        op_bench(output_precision).generate_bench(
                            self.processor, test_num, unroll_factor))

    bench_statement.add(Return(0))
    return bench_statement
def vectorize_function_scheme(vectorizer, name_factory, scalar_scheme,
                              scalar_output_format, scalar_arg_list,
                              vector_size, sub_vector_size=None):
    """ Use a vectorization engine @p vectorizer to vectorize the sub-graph
        @p scalar_scheme, that is, transforming its inputs and outputs from
        scalar to vector and performing the required internal path
        duplication """

    sub_vector_size = vector_size if sub_vector_size is None else sub_vector_size

    vec_arg_list, vector_scheme, vector_mask = \
        vectorizer.vectorize_scheme(scalar_scheme, scalar_arg_list,
                                    vector_size, sub_vector_size)

    vector_output_format = vectorize_format(scalar_output_format, vector_size)

    vec_res = Variable("vec_res", precision=vector_output_format,
                       var_type=Variable.Local)

    vector_mask.set_attributes(tag="vector_mask", debug=debug_multi)

    callback_name = "scalar_callback"
    scalar_callback_fct = generate_function_from_optree(
        name_factory, scalar_scheme, scalar_arg_list, callback_name,
        scalar_output_format)
    scalar_callback = scalar_callback_fct.get_function_object()

    if no_scalar_fallback_required(vector_mask):
        function_scheme = Statement(
            Return(vector_scheme, precision=vector_output_format))
    else:
        function_scheme = generate_c_vector_wrapper(
            vector_size, vec_arg_list, vector_scheme, vector_mask, vec_res,
            scalar_callback)

    return (vec_res, vec_arg_list, function_scheme, scalar_callback,
            scalar_callback_fct)
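# Plain-Python sketch of the wrapper contract above: compute the vector path
# for every lane, then re-run a scalar callback on the lanes whose mask bit
# says the fast path is invalid. All names here are illustrative; the real
# wrapper is generated as C code by generate_c_vector_wrapper.
import math

def vector_with_fallback(vector_fn, scalar_fn, lanes):
    results, valid = vector_fn(lanes)
    return [r if ok else scalar_fn(x)
            for r, ok, x in zip(results, valid, lanes)]

def vec_recip(xs):
    # "vector" path: valid wherever x is non-zero
    return ([1.0 / x if x != 0 else None for x in xs],
            [x != 0 for x in xs])

out = vector_with_fallback(vec_recip, lambda x: math.inf, [2.0, 0.0, 4.0])
assert out == [0.5, math.inf, 0.25]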
def generic_atan2_generate(self, _vx, vy=None):
    """ if vy is None, compute atan(_vx), else compute atan2(vy, _vx) """
    if vy is None:
        # approximation
        # if abs_vx <= 1.0 then atan(abs_vx) is directly approximated
        # if abs_vx > 1.0 then atan(abs_vx) = pi/2 - atan(1 / abs_vx)
        #
        # for vx >= 0, atan(vx) = atan(abs_vx)
        #
        # for vx < 0, atan(vx) = -atan(abs_vx) for vx < 0
        #                      = -pi/2 + atan(1 / abs_vx)
        vx = _vx
        sign_cond = vx < 0
        abs_vx = Select(vx < 0, -vx, vx, tag="abs_vx", debug=debug_multi)
        bound_cond = abs_vx > 1
        inv_abs_vx = 1 / abs_vx
        # condition to select subtraction
        cond = LogicalOr(LogicalAnd(vx < 0, LogicalNot(bound_cond)),
                         vx > 1,
                         tag="cond", debug=debug_multi)
        # reduced argument
        red_vx = Select(bound_cond, inv_abs_vx, abs_vx,
                        tag="red_vx", debug=debug_multi)
        offset = None
    else:
        # bound_cond is True iff Abs(vy / _vx) > 1.0
        bound_cond = Abs(vy) > Abs(_vx)
        bound_cond.set_attributes(tag="bound_cond", debug=debug_multi)
        # vx and vy are of opposite signs
        # sign_cond = (_vx * vy) < 0
        # using cast to int(signed) and bitwise xor
        # to determine rapidly if _vx and vy are of opposite sign
        fast_sign_cond = BitLogicXor(
            TypeCast(_vx, precision=self.precision.get_integer_format()),
            TypeCast(vy, precision=self.precision.get_integer_format()),
            precision=self.precision.get_integer_format()) < 0
        sign_cond = fast_sign_cond
        sign_cond.set_attributes(tag="sign_cond", debug=debug_multi)

        # condition to select subtraction
        # TODO: could be accelerated if LogicalXor existed
        slow_cond = LogicalOr(
            LogicalAnd(sign_cond, LogicalNot(bound_cond)),  # -1 < (vy / _vx) < 0
            LogicalAnd(bound_cond, LogicalNot(sign_cond)),  # (vy / _vx) > 1
            tag="cond", debug=debug_multi)
        cond = slow_cond

        numerator = Select(bound_cond, _vx, vy, tag="numerator", debug=debug_multi)
        denominator = Select(bound_cond, vy, _vx, tag="denominator", debug=debug_multi)
        # reduced argument
        red_vx = Abs(numerator) / Abs(denominator)
        red_vx.set_attributes(tag="red_vx", debug=debug_multi)

        offset = Select(
            _vx > 0,
            Constant(0, precision=self.precision),
            # vx < 0
            Select(
                sign_cond,
                # vy > 0
                Constant(sollya.pi, precision=self.precision),
                Constant(-sollya.pi, precision=self.precision),
                precision=self.precision),
            precision=self.precision,
            tag="offset")

    approx_fct = sollya.atan(sollya.x)

    if self.method == "piecewise":
        sign_vx = Select(cond, -1, 1, precision=self.precision,
                         tag="sign_vx", debug=debug_multi)
        cst_sign = Select(sign_cond, -1, 1, precision=self.precision,
                          tag="cst_sign", debug=debug_multi)
        cst = cst_sign * Select(bound_cond, sollya.pi / 2, 0,
                                precision=self.precision)
        cst.set_attributes(tag="cst", debug=debug_multi)

        bound_low = 0.0
        bound_high = 1.0
        num_intervals = self.num_sub_intervals
        error_threshold = S2**-(self.precision.get_mantissa_size() + 8)

        approx, eval_error = piecewise_approximation(
            approx_fct, red_vx, self.precision,
            bound_low=bound_low, bound_high=bound_high,
            max_degree=None, num_intervals=num_intervals,
            error_threshold=error_threshold, odd=True)
        result = cst + sign_vx * approx
        result.set_attributes(tag="result", precision=self.precision,
                              debug=debug_multi)

    elif self.method == "single":
        approx_interval = Interval(0, 1.0)
        # determining the degree of the polynomial approximation
        poly_degree_range = sollya.guessdegree(
            approx_fct / sollya.x, approx_interval,
            S2**-(self.precision.get_field_size() + 2))
        poly_degree = int(sollya.sup(poly_degree_range)) + 4
        Log.report(Log.Info, "poly_degree={}".format(poly_degree))

        # arctan is an odd function, so only odd coefficients must be non-zero
        poly_degree_list = list(range(1, poly_degree + 1, 2))
        poly_object, poly_error = Polynomial.build_from_approximation_with_error(
            approx_fct, poly_degree_list,
            [1] + [self.precision.get_sollya_object()] * (len(poly_degree_list) - 1),
            approx_interval)

        odd_predicate = lambda index, _: ((index - 1) % 4 != 0)
        even_predicate = lambda index, _: (index != 1 and (index - 1) % 4 == 0)
        poly_odd_object = poly_object.sub_poly_cond(odd_predicate, offset=1)
        poly_even_object = poly_object.sub_poly_cond(even_predicate, offset=1)

        sollya.settings.display = sollya.hexadecimal
        Log.report(Log.Info, "poly_error: {}".format(poly_error))
        Log.report(Log.Info, "poly_odd: {}".format(poly_odd_object))
        Log.report(Log.Info, "poly_even: {}".format(poly_even_object))

        poly_odd = PolynomialSchemeEvaluator.generate_horner_scheme(
            poly_odd_object, abs_vx)
        poly_odd.set_attributes(tag="poly_odd", debug=debug_multi)
        poly_even = PolynomialSchemeEvaluator.generate_horner_scheme(
            poly_even_object, abs_vx)
        poly_even.set_attributes(tag="poly_even", debug=debug_multi)
        exact_sum = poly_odd + poly_even
        exact_sum.set_attributes(tag="exact_sum", debug=debug_multi)

        # poly_even should be (1 + poly_even)
        result = vx + vx * exact_sum
        result.set_attributes(tag="result", precision=self.precision,
                              debug=debug_multi)
    else:
        raise NotImplementedError

    if offset is not None:
        result = result + offset

    std_scheme = Statement(Return(result))
    scheme = std_scheme
    return scheme
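# Standalone illustration of the argument reduction used above: for |x| > 1,
# atan(|x|) = pi/2 - atan(1/|x|), and the sign of x is reapplied at the end,
# so the core approximation only ever sees arguments in [0, 1]. Demo only,
# with math.atan standing in for the polynomial core.
import math

def atan_reduced(x):
    abs_x = abs(x)
    if abs_x > 1.0:
        core = math.pi / 2 - math.atan(1.0 / abs_x)
    else:
        core = math.atan(abs_x)
    return -core if x < 0 else core

for x in (-3.0, -0.5, 0.25, 8.0):
    assert abs(atan_reduced(x) - math.atan(x)) < 1e-15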
def generate_scheme(self):
    # We wish to compute vx / vy
    vx = self.implementation.add_input_variable(
        "x", self.precision, interval=self.input_intervals[0])
    vy = self.implementation.add_input_variable(
        "y", self.precision, interval=self.input_intervals[1])

    # maximum exponent magnitude (to avoid overflow/underflow during
    # intermediary computations)
    int_prec = self.precision.get_integer_format()
    max_exp_mag = Constant(self.precision.get_emax() - 1, precision=int_prec)

    exact_ex = ExponentExtraction(vx, tag="exact_ex", precision=int_prec,
                                  debug=debug_multi)
    exact_ey = ExponentExtraction(vy, tag="exact_ey", precision=int_prec,
                                  debug=debug_multi)

    ex = Max(Min(exact_ex, max_exp_mag, precision=int_prec), -max_exp_mag,
             tag="ex", precision=int_prec)
    ey = Max(Min(exact_ey, max_exp_mag, precision=int_prec), -max_exp_mag,
             tag="ey", precision=int_prec)

    Attributes.set_default_rounding_mode(ML_RoundToNearest)
    Attributes.set_default_silent(True)

    # computing the reciprocal approximation
    init_approx = None

    scaling_factor_x = ExponentInsertion(-ex, tag="sfx_ei",
                                         precision=self.precision,
                                         debug=debug_multi)
    scaling_factor_y = ExponentInsertion(-ey, tag="sfy_ei",
                                         precision=self.precision,
                                         debug=debug_multi)

    def test_interval_out_of_bound_risk(x_range, y_range):
        """ Try to determine from x's and y's intervals if there is a risk
            of underflow or overflow """
        div_range = abs(x_range / y_range)
        underflow_risk = sollya.inf(div_range) < S2**(
            self.precision.get_emin_normal() + 2)
        overflow_risk = sollya.sup(div_range) > S2**(
            self.precision.get_emax() - 2)
        return underflow_risk or overflow_risk

    out_of_bound_risk = (self.input_intervals[0] is None
                         or self.input_intervals[1] is None
                         ) or test_interval_out_of_bound_risk(
                             self.input_intervals[0], self.input_intervals[1])
    Log.report(Log.Debug, "out_of_bound_risk: {}".format(out_of_bound_risk))

    # scaled version of vx and vy, to avoid overflow and underflow
    if out_of_bound_risk:
        scaled_vx = vx * scaling_factor_x
        scaled_vy = vy * scaling_factor_y
        scaled_interval = MetaIntervalList(
            [MetaInterval(Interval(-2, -1)),
             MetaInterval(Interval(1, 2))])
        scaled_vx.set_attributes(tag="scaled_vx", debug=debug_multi,
                                 interval=scaled_interval)
        scaled_vy.set_attributes(tag="scaled_vy", debug=debug_multi,
                                 interval=scaled_interval)
        seed_interval = 1 / scaled_interval
        print("seed_interval=1/{}={}".format(scaled_interval, seed_interval))
    else:
        scaled_vx = vx
        scaled_vy = vy
        seed_interval = 1 / scaled_vy.get_interval()

    # We need a first approximation to 1 / scaled_vy
    dummy_seed = ReciprocalSeed(EmptyOperand(precision=self.precision),
                                precision=self.precision)
    if self.processor.is_supported_operation(dummy_seed, self.language):
        init_approx = ReciprocalSeed(scaled_vy, precision=self.precision,
                                     tag="init_approx", debug=debug_multi)
    else:
        # generate tabulated version of seed
        raise NotImplementedError

    current_approx_std = init_approx
    # correctly-rounded inverse computation
    num_iteration = self.num_iter

    Attributes.unset_default_rounding_mode()
    Attributes.unset_default_silent()

    # check if inputs are zeros
    x_zero = Test(vx, specifier=Test.IsZero, likely=False, precision=ML_Bool)
    y_zero = Test(vy, specifier=Test.IsZero, likely=False, precision=ML_Bool)

    comp_sign = Test(vx, vy, specifier=Test.CompSign,
                     tag="comp_sign", debug=debug_multi)

    # check if divisor is NaN
    y_nan = Test(vy, specifier=Test.IsNaN, likely=False, precision=ML_Bool)

    # check if inputs are signaling NaNs
    x_snan = Test(vx, specifier=Test.IsSignalingNaN, likely=False,
                  precision=ML_Bool)
    y_snan = Test(vy, specifier=Test.IsSignalingNaN, likely=False,
                  precision=ML_Bool)

    # check if inputs are infinities
    x_inf = Test(vx, specifier=Test.IsInfty, likely=False,
                 tag="x_inf", precision=ML_Bool)
    y_inf = Test(vy, specifier=Test.IsInfty, likely=False,
                 tag="y_inf", debug=debug_multi, precision=ML_Bool)

    scheme = None
    gappa_vx, gappa_vy = None, None

    # initial reciprocal approximation of 1.0 / scaled_vy
    inv_iteration_list, recp_approx = compute_reduced_reciprocal(
        init_approx, scaled_vy, self.num_iter)
    recp_approx.set_attributes(tag="recp_approx", debug=debug_multi)

    # approximation of scaled_vx / scaled_vy
    yerr_last, reduced_div_approx, div_iteration_list = compute_reduced_division(
        scaled_vx, scaled_vy, recp_approx)

    eval_error_range, div_eval_error_range = self.solve_eval_error(
        init_approx, recp_approx, reduced_div_approx, scaled_vx, scaled_vy,
        inv_iteration_list, div_iteration_list, S2**-7, seed_interval)
    eval_error = sup(abs(eval_error_range))
    recp_interval = 1 / scaled_vy.get_interval() + eval_error_range
    recp_approx.set_interval(recp_interval)

    div_interval = (scaled_vx.get_interval() / scaled_vy.get_interval()
                    + div_eval_error_range)
    reduced_div_approx.set_interval(div_interval)
    reduced_div_approx.set_tag("reduced_div_approx")

    if out_of_bound_risk:
        unscaled_result = scaling_div_result(reduced_div_approx, ex,
                                             scaling_factor_y, self.precision)
        subnormal_result = subnormalize_result(recp_approx, reduced_div_approx,
                                               ex, ey, yerr_last, self.precision)
    else:
        unscaled_result = reduced_div_approx
        subnormal_result = reduced_div_approx

    x_inf_or_nan = Test(vx, specifier=Test.IsInfOrNaN, likely=False)
    y_inf_or_nan = Test(vy, specifier=Test.IsInfOrNaN, likely=False,
                        tag="y_inf_or_nan", debug=debug_multi)

    # generate IEEE exception raising only if libm-compliant
    # mode is enabled
    enable_raise = self.libm_compliant

    # managing special cases
    # x inf and y inf
    pre_scheme = ConditionBlock(
        x_inf_or_nan,
        ConditionBlock(
            x_inf,
            ConditionBlock(
                y_inf_or_nan,
                Statement(
                    # signaling NaNs raise invalid operation flags
                    ConditionBlock(y_snan, Raise(ML_FPE_Invalid))
                    if enable_raise else Statement(),
                    Return(FP_QNaN(self.precision)),
                ),
                ConditionBlock(comp_sign,
                               Return(FP_MinusInfty(self.precision)),
                               Return(FP_PlusInfty(self.precision)))),
            Statement(
                ConditionBlock(x_snan, Raise(ML_FPE_Invalid))
                if enable_raise else Statement(),
                Return(FP_QNaN(self.precision)))),
        ConditionBlock(
            x_zero,
            ConditionBlock(
                LogicalOr(y_zero, y_nan, precision=ML_Bool),
                Statement(
                    ConditionBlock(y_snan, Raise(ML_FPE_Invalid))
                    if enable_raise else Statement(),
                    Return(FP_QNaN(self.precision))),
                Return(vx)),
            ConditionBlock(
                y_inf_or_nan,
                ConditionBlock(
                    y_inf,
                    Return(
                        Select(comp_sign,
                               FP_MinusZero(self.precision),
                               FP_PlusZero(self.precision))),
                    Statement(
                        ConditionBlock(y_snan, Raise(ML_FPE_Invalid))
                        if enable_raise else Statement(),
                        Return(FP_QNaN(self.precision)))),
                ConditionBlock(
                    y_zero,
                    Statement(
                        Raise(ML_FPE_DivideByZero)
                        if enable_raise else Statement(),
                        ConditionBlock(
                            comp_sign,
                            Return(FP_MinusInfty(self.precision)),
                            Return(FP_PlusInfty(self.precision)))),
                    # managing numerical value result cases
                    Statement(
                        recp_approx,
                        reduced_div_approx,
                        ConditionBlock(
                            Test(unscaled_result,
                                 specifier=Test.IsSubnormal, likely=False),
                            # result is subnormal
                            Statement(
                                # inexact flag should have been raised when computing yerr_last
                                # ConditionBlock(
                                #    Comparison(
                                #        yerr_last, 0,
                                #        specifier=Comparison.NotEqual, likely=True),
                                #    Statement(Raise(ML_FPE_Inexact, ML_FPE_Underflow))
                                # ),
                                Return(subnormal_result),
                            ),
                            # result is normal
                            Statement(
                                # inexact flag should have been raised when computing yerr_last
                                # ConditionBlock(
                                #    Comparison(
                                #        yerr_last, 0,
                                #        specifier=Comparison.NotEqual, likely=True),
                                #    Raise(ML_FPE_Inexact)
                                # ),
                                Return(unscaled_result))),
                    )))))

    # managing rounding mode save and restore
    # to ensure intermediary computations are performed in round-to-nearest
    # clearing exception before final computation
    # rnd_mode = GetRndMode()
    # scheme = Statement(
    #     rnd_mode,
    #     SetRndMode(ML_RoundToNearest),
    #     yerr_last,
    #     SetRndMode(rnd_mode),
    #     unscaled_result,
    #     ClearException(),
    #     pre_scheme
    # )

    scheme = pre_scheme

    return scheme
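# Standalone sketch of the Newton-Raphson reciprocal refinement that
# compute_reduced_reciprocal builds in the graph above: starting from a crude
# seed for 1/y, each iteration r' = r * (2 - y*r) roughly doubles the number
# of correct bits (plain-Python illustration; names are not from metalibm).
def refine_reciprocal(y, seed, num_iter=3):
    r = seed
    for _ in range(num_iter):
        r = r * (2.0 - y * r)  # quadratic convergence toward 1/y
    return r

y = 1.37
approx = refine_reciprocal(y, seed=0.7, num_iter=4)
assert abs(approx - 1.0 / y) < 1e-12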
def generate_expr(self, code_object, optree, folded=True, result_var=None,
                  initial=False, language=None, force_variable_storing=False):
    """ code generation function """
    language = self.language if language is None else language

    # search if <optree> has already been processed
    if self.has_memoization(optree):
        return self.get_memoization(optree)

    result = None
    # implementation generation
    if isinstance(optree, CodeVariable):
        result = optree

    elif isinstance(optree, Variable):
        if optree.get_var_type() is Variable.Local:
            final_var = code_object.get_free_var_name(
                optree.get_precision(), prefix=optree.get_tag(), declare=True)
            result = CodeVariable(final_var, optree.get_precision())
        else:
            result = CodeVariable(optree.get_tag(), optree.get_precision())

    elif isinstance(optree, ML_NewTable):
        # Implementing LeafNode ML_NewTable generation support
        table = optree
        tag = table.get_tag()
        table_name = code_object.declare_table(
            table, prefix=tag if tag is not None else "table")
        result = CodeVariable(table_name, table.get_precision())

    elif isinstance(optree, SwitchBlock):
        switch_value = optree.inputs[0]

        # generating pre_statement
        self.generate_expr(code_object, optree.get_pre_statement(),
                           folded=folded, language=language)

        switch_value_code = self.generate_expr(code_object, switch_value,
                                               folded=folded, language=language)
        case_map = optree.get_case_map()

        code_object << "\nswitch(%s) {\n" % switch_value_code.get()
        for case in case_map:
            case_value = case
            case_statement = case_map[case]
            if isinstance(case_value, tuple):
                for sub_case in case:
                    code_object << "case %s:\n" % sub_case
            else:
                code_object << "case %s:\n" % case
            code_object.open_level()
            self.generate_expr(code_object, case_statement,
                               folded=folded, language=language)
            code_object.close_level()
        code_object << "}\n"
        return None

    elif isinstance(optree, ReferenceAssign):
        output_var = optree.inputs[0]
        result_value = optree.inputs[1]

        output_var_code = self.generate_expr(code_object, output_var,
                                             folded=False, language=language)

        if isinstance(result_value, Constant):
            # generate assignation
            result_value_code = self.generate_expr(code_object, result_value,
                                                   folded=folded,
                                                   language=language)
            code_object << self.generate_assignation(
                output_var_code.get(), result_value_code.get())
        else:
            result_value_code = self.generate_expr(code_object, result_value,
                                                   folded=folded,
                                                   language=language)
            code_object << self.generate_assignation(
                output_var_code.get(), result_value_code.get())
        if optree.get_debug() and not self.disable_debug:
            code_object << self.generate_debug_msg(
                result_value, result_value_code, code_object,
                debug_object=optree.get_debug())

        # code_object << self.generate_assignation(output_var_code.get(), result_value_code.get())
        # code_object << output_var.get_precision().generate_c_assignation(output_var_code, result_value_code)
        return None

    elif isinstance(optree, Loop):
        init_statement = optree.inputs[0]
        exit_condition = optree.inputs[1]
        loop_body = optree.inputs[2]

        self.generate_expr(code_object, init_statement,
                           folded=folded, language=language)
        code_object << "\nfor (;%s;)" % self.generate_expr(
            code_object, exit_condition, folded=False, language=language).get()
        code_object.open_level()
        self.generate_expr(code_object, loop_body,
                           folded=folded, language=language)
        code_object.close_level()

        return None

    elif isinstance(optree, ConditionBlock):
        condition = optree.inputs[0]
        if_branch = optree.inputs[1]
        else_branch = optree.inputs[2] if len(optree.inputs) > 2 else None

        # generating pre_statement
        self.generate_expr(code_object, optree.get_pre_statement(),
                           folded=folded, language=language)

        cond_code = self.generate_expr(code_object, condition,
                                       folded=folded, language=language)
        if isinstance(condition, BooleanOperation):
            cond_likely = condition.get_likely()
        else:
            # TODO: to be refined (for example Constant(True)
            # should be associated with likely True)
            cond_likely = None
            Log.report(
                Log.Warning,
                "the following condition has no (usable) likely attribute: {}",
                condition,
            )
        if cond_likely in [True, False]:
            code_object << "\nif (__builtin_expect(%s, %d)) " % (
                cond_code.get(), {True: 1, False: 0}[cond_likely])
        else:
            code_object << "\nif (%s) " % cond_code.get()
        self.open_memoization_level()
        code_object.open_level()
        # if_branch_code = self.processor.generate_expr(self, code_object, if_branch, if_branch.inputs, folded)
        if_branch_code = self.generate_expr(code_object, if_branch,
                                            folded=folded, language=language)
        code_object.close_level(cr="")
        self.close_memoization_level()
        if else_branch:
            code_object << " else "
            code_object.open_level()
            self.open_memoization_level()
            else_branch_code = self.generate_expr(code_object, else_branch,
                                                  folded=folded,
                                                  language=language)
            code_object.close_level()
            self.close_memoization_level()
        else:
            code_object << "\n"
        return None

    elif isinstance(optree, Return):
        if len(optree.inputs) == 0:
            # void return
            code_object << "return;\n"
        else:
            return_result = optree.inputs[0]
            return_code = self.generate_expr(code_object, return_result,
                                             folded=folded, language=language)
            code_object << "return %s;\n" % return_code.get()
        return None  # return_code

    elif isinstance(optree, ExceptionOperation):
        if optree.get_specifier() in [ExceptionOperation.RaiseException,
                                      ExceptionOperation.ClearException,
                                      ExceptionOperation.RaiseReturn]:
            result_code = self.processor.generate_expr(
                self, code_object, optree, optree.inputs, folded=False,
                result_var=result_var, language=language)
            code_object << "%s;\n" % result_code.get()
            if optree.get_specifier() == ExceptionOperation.RaiseReturn:
                if self.libm_compliant:
                    # libm-compliant exception management
                    code_object.add_header("support_lib/ml_libm_compatibility.h")
                    return_value = self.generate_expr(
                        code_object, optree.get_return_value(),
                        folded=folded, language=language)
                    arg_value = self.generate_expr(
                        code_object, optree.get_arg_value(),
                        folded=folded, language=language)
                    function_name = optree.function_name
                    exception_list = [op.get_value() for op in optree.inputs]
                    if ML_FPE_Inexact in exception_list:
                        exception_list.remove(ML_FPE_Inexact)
                    if len(exception_list) > 1:
                        raise NotImplementedError
                    if ML_FPE_Overflow in exception_list:
                        code_object << "return ml_raise_libm_overflowf(%s, %s, \"%s\");\n" % (
                            return_value.get(), arg_value.get(), function_name)
                    elif ML_FPE_Underflow in exception_list:
                        code_object << "return ml_raise_libm_underflowf(%s, %s, \"%s\");\n" % (
                            return_value.get(), arg_value.get(), function_name)
                    elif ML_FPE_Invalid in exception_list:
                        code_object << "return %s;\n" % return_value.get()
                else:
                    return_precision = optree.get_return_value().get_precision()
                    self.generate_expr(
                        code_object,
                        Return(optree.get_return_value(),
                               precision=return_precision),
                        folded=folded, language=language)
            return None
        else:
            result = self.processor.generate_expr(self, code_object, optree,
                                                  optree.inputs, folded=folded,
                                                  result_var=result_var,
                                                  language=language)

    elif isinstance(optree, NoResultOperation):
        result_code = self.processor.generate_expr(self, code_object, optree,
                                                   optree.inputs, folded=False,
                                                   result_var=result_var,
                                                   language=language)
        code_object << "%s;\n" % result_code.get()
        return None

    elif isinstance(optree, PlaceHolder):
        head = optree.get_input(0)
        for tail_node in optree.inputs[1:]:
            if not self.has_memoization(tail_node):
                self.generate_expr(code_object, tail_node, folded=folded,
                                   initial=True, language=language)
        # generate PlaceHolder's main_value
        head_code = self.generate_expr(code_object, head, folded=folded,
                                       initial=initial, language=language)
        return head_code

    elif isinstance(optree, Statement):
        for op in optree.inputs:
            if not self.has_memoization(op):
                self.generate_expr(code_object, op, folded=folded,
                                   initial=True, language=language)
        return None

    elif isinstance(optree, Constant):
        generate_pre_process = (self.generate_clear_exception
                                if optree.get_clearprevious() else None)
        result = self.processor.generate_expr(
            self, code_object, optree, [],
            generate_pre_process=generate_pre_process, folded=folded,
            result_var=result_var, language=language)

    else:
        generate_pre_process = (self.generate_clear_exception
                                if optree.get_clearprevious() else None)
        result = self.processor.generate_expr(
            self, code_object, optree, optree.inputs,
            generate_pre_process=generate_pre_process, folded=folded,
            result_var=result_var, language=language)

    # registering result into memoization table
    self.add_memoization(optree, result)

    # debug management
    if optree.get_debug() and not self.disable_debug:
        code_object << self.generate_debug_msg(optree, result, code_object)

    if initial and not isinstance(result, CodeVariable) and result is not None:
        final_var = result_var if result_var else code_object.get_free_var_name(
            optree.get_precision(), prefix="result", declare=True)
        code_object << self.generate_assignation(final_var, result.get())
        return CodeVariable(final_var, optree.get_precision())

    return result
def generate_scalar_scheme(self, vx, vy):
    # fixing input nodes' tags
    vx.set_attributes(tag="x")
    vy.set_attributes(tag="y")

    int_precision = self.precision.get_integer_format()

    # assuming x = m.2^e (m in [1, 2[)
    #
    # pow(x, y) = x^y
    #           = exp(y * log(x))
    #           = 2^(y * log2(x))
    #           = 2^(y * (log2(m) + e))
    #
    e = ExponentExtraction(vx, tag="e", precision=int_precision)
    m = MantissaExtraction(vx, tag="m", precision=self.precision)

    # approximation log2(m)

    # retrieving processor inverse approximation table
    dummy_var = Variable("dummy", precision=self.precision)
    dummy_div_seed = ReciprocalSeed(dummy_var, precision=self.precision)
    inv_approx_table = self.processor.get_recursive_implementation(
        dummy_div_seed, language=None,
        table_getter=lambda self: self.approx_table_map)

    log_f = sollya.log(sollya.x)  # /sollya.log(self.basis)

    ml_log_args = ML_GenericLog.get_default_args(precision=self.precision,
                                                 basis=2)
    ml_log = ML_GenericLog(ml_log_args)
    log_table, log_table_tho, table_index_range = ml_log.generate_log_table(
        log_f, inv_approx_table)
    log_approx = ml_log.generate_reduced_log_split(
        Abs(m, precision=self.precision), log_f, inv_approx_table, log_table)

    log_approx = Select(Equal(vx, 0), FP_MinusInfty(self.precision), log_approx)
    log_approx.set_attributes(tag="log_approx", debug=debug_multi)
    r = Multiplication(log_approx, vy, tag="r", debug=debug_multi)

    # 2^(y * (log2(m) + e)) = 2^(y * log2(m)) * 2^(y * e)
    #
    # log_approx = log2(Abs(m))
    # r = y * log_approx ~ y * log2(m)
    #
    # NOTES: manage cases where e is negative and
    # (y * log2(m)) AND (y * e) could cancel out
    # if e is positive, whichever the sign of y, (y * log2(m)) and (y * e)
    # CANNOT be of opposite signs
    # log2(m) in [0, 1[ so cancellation can occur only if e == -1
    # we split 2^x in 2^x = 2^t0 * 2^t1
    # if e < 0: t0 = y * (log2(m) + e), t1 = 0
    # else:     t0 = y * log2(m), t1 = y * e
    t_cond = e < 0

    # e_y ~ e * y
    e_f = Conversion(e, precision=self.precision)
    # t0 = Select(t_cond, (e_f + log_approx) * vy, Multiplication(e_f, vy), tag="t0")
    # NearestInteger(t0, precision=self.precision, tag="t0_int")
    EY = NearestInteger(e_f * vy, tag="EY", precision=self.precision)
    LY = NearestInteger(log_approx * vy, tag="LY", precision=self.precision)
    t0_int = Select(t_cond, EY + LY, EY, tag="t0_int")
    t0_frac = Select(t_cond,
                     FMA(e_f, vy, -EY) + FMA(log_approx, vy, -LY),
                     EY - t0_int, tag="t0_frac")
    # t0_frac.set_attributes(tag="t0_frac")

    ml_exp2_args = ML_Exp2.get_default_args(precision=self.precision)
    ml_exp2 = ML_Exp2(ml_exp2_args)

    exp2_t0_frac = ml_exp2.generate_scalar_scheme(t0_frac, inline_select=True)
    exp2_t0_frac.set_attributes(tag="exp2_t0_frac", debug=debug_multi)

    exp2_t0_int = ExponentInsertion(Conversion(t0_int, precision=int_precision),
                                    precision=self.precision, tag="exp2_t0_int")

    t1 = Select(t_cond, Constant(0, precision=self.precision), r)
    exp2_t1 = ml_exp2.generate_scalar_scheme(t1, inline_select=True)
    exp2_t1.set_attributes(tag="exp2_t1", debug=debug_multi)

    result_sign = Constant(1.0, precision=self.precision)
    # Select(n_is_odd, CopySign(vx, Constant(1.0, precision=self.precision)), 1)

    y_int = NearestInteger(vy, precision=self.precision)
    y_is_integer = Equal(y_int, vy)
    y_is_even = LogicalOr(
        # if y is a number (exc. inf) greater than 2**(mantissa_size + 1),
        # then it is an integer multiple of 2 => even
        Abs(vy) >= 2**(self.precision.get_mantissa_size() + 1),
        LogicalAnd(
            LogicalAnd(y_is_integer,
                       Abs(vy) < 2**(self.precision.get_mantissa_size() + 1)),
            # we want to limit the modulo computation to an integer input
            Equal(Modulo(Conversion(y_int, precision=int_precision), 2), 0)
        )
    )
    y_is_odd = LogicalAnd(
        LogicalAnd(
            Abs(vy) < 2**(self.precision.get_mantissa_size() + 1),
            y_is_integer),
        Equal(Modulo(Conversion(y_int, precision=int_precision), 2), 1)
    )

    # special cases management
    special_case_results = Statement(
        # x is sNaN OR y is sNaN
        ConditionBlock(
            LogicalOr(Test(vx, specifier=Test.IsSignalingNaN),
                      Test(vy, specifier=Test.IsSignalingNaN)),
            Return(FP_QNaN(self.precision))
        ),
        # pow(x, ±0) is 1 if x is not a signaling NaN
        ConditionBlock(
            Test(vy, specifier=Test.IsZero),
            Return(Constant(1.0, precision=self.precision))
        ),
        # pow(±0, y) is ±∞ and signals the divideByZero exception
        # for y an odd integer < 0
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsZero),
                       LogicalAnd(y_is_odd, vy < 0)),
            Return(Select(Test(vx, specifier=Test.IsPositiveZero),
                          FP_PlusInfty(self.precision),
                          FP_MinusInfty(self.precision))),
        ),
        # pow(±0, −∞) is +∞ with no exception
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsZero),
                       Test(vy, specifier=Test.IsNegativeInfty)),
            Return(FP_PlusInfty(self.precision)),
        ),
        # pow(±0, +∞) is +0 with no exception
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsZero),
                       Test(vy, specifier=Test.IsPositiveInfty)),
            Return(FP_PlusZero(self.precision)),
        ),
        # pow(±0, y) is ±0 for finite y > 0 an odd integer
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsZero),
                       LogicalAnd(y_is_odd, vy > 0)),
            Return(vx),
        ),
        # pow(−1, ±∞) is 1 with no exception
        ConditionBlock(
            LogicalAnd(Equal(vx, -1), Test(vy, specifier=Test.IsInfty)),
            Return(Constant(1.0, precision=self.precision)),
        ),
        # pow(+1, y) is 1 for any y (even a quiet NaN)
        ConditionBlock(
            vx == 1,
            Return(Constant(1.0, precision=self.precision)),
        ),
        # pow(x, +∞) is +0 for −1 < x < 1
        ConditionBlock(
            LogicalAnd(Abs(vx) < 1, Test(vy, specifier=Test.IsPositiveInfty)),
            Return(FP_PlusZero(self.precision))
        ),
        # pow(x, +∞) is +∞ for x < −1 or for 1 < x (including ±∞)
        ConditionBlock(
            LogicalAnd(Abs(vx) > 1, Test(vy, specifier=Test.IsPositiveInfty)),
            Return(FP_PlusInfty(self.precision))
        ),
        # pow(x, −∞) is +∞ for −1 < x < 1
        ConditionBlock(
            LogicalAnd(Abs(vx) < 1, Test(vy, specifier=Test.IsNegativeInfty)),
            Return(FP_PlusInfty(self.precision))
        ),
        # pow(x, −∞) is +0 for x < −1 or for 1 < x (including ±∞)
        ConditionBlock(
            LogicalAnd(Abs(vx) > 1, Test(vy, specifier=Test.IsNegativeInfty)),
            Return(FP_PlusZero(self.precision))
        ),
        # pow(+∞, y) is +0 for a number y < 0
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsPositiveInfty), vy < 0),
            Return(FP_PlusZero(self.precision))
        ),
        # pow(+∞, y) is +∞ for a number y > 0
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsPositiveInfty), vy > 0),
            Return(FP_PlusInfty(self.precision))
        ),
        # pow(−∞, y) is −0 for finite y < 0 an odd integer
        # TODO: check y is finite
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsNegativeInfty),
                       LogicalAnd(y_is_odd, vy < 0)),
            Return(FP_MinusZero(self.precision)),
        ),
        # pow(−∞, y) is −∞ for finite y > 0 an odd integer
        # TODO: check y is finite
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsNegativeInfty),
                       LogicalAnd(y_is_odd, vy > 0)),
            Return(FP_MinusInfty(self.precision)),
        ),
        # pow(−∞, y) is +0 for finite y < 0 and not an odd integer
        # TODO: check y is finite
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsNegativeInfty),
                       LogicalAnd(LogicalNot(y_is_odd), vy < 0)),
            Return(FP_PlusZero(self.precision)),
        ),
        # pow(−∞, y) is +∞ for finite y > 0 and not an odd integer
        # TODO: check y is finite
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsNegativeInfty),
                       LogicalAnd(LogicalNot(y_is_odd), vy > 0)),
            Return(FP_PlusInfty(self.precision)),
        ),
        # pow(±0, y) is +∞ and signals the divideByZero exception
        # for finite y < 0 and not an odd integer
        # TODO: signal divideByZero exception
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsZero),
                       LogicalAnd(LogicalNot(y_is_odd), vy < 0)),
            Return(FP_PlusInfty(self.precision)),
        ),
        # pow(±0, y) is +0 for finite y > 0 and not an odd integer
        ConditionBlock(
            LogicalAnd(Test(vx, specifier=Test.IsZero),
                       LogicalAnd(LogicalNot(y_is_odd), vy > 0)),
            Return(FP_PlusZero(self.precision)),
        ),
    )

    # manage n=1 separately to avoid catastrophic propagation of errors
    # between log2 and exp2 to eventually compute the identity function
    # test-case #3
    result = Statement(
        special_case_results,
        # fallback default cases
        Return(result_sign * exp2_t1 * exp2_t0_int * exp2_t0_frac))
    return result
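# Plain-Python illustration of the y_is_odd / y_is_even classification above:
# any finite float with |y| >= 2**(p+1) (p = mantissa size) is necessarily an
# even integer, and below that threshold integerness and parity can be tested
# directly. Binary64 parameters (p = 52) are assumed for this demo.
def classify_parity(y, mantissa_size=52):
    threshold = 2.0 ** (mantissa_size + 1)
    if abs(y) >= threshold:
        return "even"
    if y == int(y):
        return "even" if int(y) % 2 == 0 else "odd"
    return "not an integer"

assert classify_parity(3.0) == "odd"
assert classify_parity(2.0 ** 53) == "even"
assert classify_parity(0.5) == "not an integer"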
def generate_scheme(self):
    # declaring function input variables
    v_x = [
        self.implementation.add_input_variable(
            "x%d" % index, self.get_input_precision(index))
        for index in range(self.arity)
    ]
    double_format = {
        ML_Binary32: ML_SingleSingle,
        ML_Binary64: ML_DoubleDouble
    }[self.precision]

    # testing Add211
    exact_add = Addition(v_x[0], v_x[1], precision=double_format, tag="exact_add")
    # testing Mul211
    exact_mul = Multiplication(v_x[0], v_x[1], precision=double_format, tag="exact_mul")
    # testing Sub211
    exact_sub = Subtraction(v_x[1], v_x[0], precision=double_format, tag="exact_sub")
    # testing Add222
    multi_add = Addition(exact_add, exact_sub, precision=double_format, tag="multi_add")
    # testing Mul222
    multi_mul = Multiplication(multi_add, exact_mul, precision=double_format, tag="multi_mul")
    # testing Add221 and Add212 and Sub222
    multi_sub = Subtraction(
        Addition(exact_sub, v_x[1], precision=double_format, tag="add221"),
        Addition(v_x[0], multi_mul, precision=double_format, tag="add212"),
        precision=double_format,
        tag="sub222")
    # testing Mul212 and Mul221
    mul212 = Multiplication(multi_sub, v_x[0], precision=double_format, tag="mul212")
    mul221 = Multiplication(exact_mul, v_x[1], precision=double_format, tag="mul221")
    # testing Sub221 and Sub212
    sub221 = Subtraction(mul212, mul221.hi, precision=double_format, tag="sub221")
    sub212 = Subtraction(sub221, mul212.lo, precision=double_format, tag="sub212")
    # testing FMA2111
    fma2111 = FMA(sub221.lo, sub212.hi, mul221.hi, precision=double_format, tag="fma2111")
    # testing FMA2112
    fma2112 = FMA(fma2111.lo, fma2111.hi, fma2111, precision=double_format, tag="fma2112")
    # testing FMA2212
    fma2212 = FMA(fma2112, fma2112.hi, fma2112, precision=double_format, tag="fma2212")
    # testing FMA2122
    fma2122 = FMA(fma2212.lo, fma2212, fma2212, precision=double_format, tag="fma2122")
    # testing FMA22222
    fma2222 = FMA(fma2122, fma2212, fma2111, precision=double_format, tag="fma2222")
    # testing Add122
    add122 = Addition(fma2222, fma2222, precision=self.precision, tag="add122")
    # testing Add112
    add112 = Addition(add122, fma2222, precision=self.precision, tag="add112")
    # testing Add121
    add121 = Addition(fma2222, add112, precision=self.precision, tag="add121")
    # testing subnormalization
    multi_subnormalize = SpecificOperation(
        Addition(add121, add112, precision=double_format),
        Constant(3, precision=self.precision.get_integer_format()),
        specifier=SpecificOperation.Subnormalize,
        precision=double_format,
        tag="multi_subnormalize")
    result = Conversion(multi_subnormalize, precision=self.precision)

    scheme = Statement(Return(result))
    return scheme
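# The double_format used above (e.g. ML_DoubleDouble) represents a value as an
# unevaluated sum hi + lo of two floats. A standalone sketch of the classic
# TwoSum error-free transformation underlying such Add211-style operators
# (illustration only, not metalibm's generated code):
def two_sum(a, b):
    """ return (hi, lo) with hi = fl(a + b) and hi + lo == a + b exactly """
    hi = a + b
    a_virtual = hi - b
    b_virtual = hi - a_virtual
    a_err = a - a_virtual
    b_err = b - b_virtual
    return hi, a_err + b_err

hi, lo = two_sum(1.0, 2.0 ** -60)
assert hi == 1.0 and lo == 2.0 ** -60  # the tiny term is recovered exactly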
def generate_scheme(self):
    # declaring CodeFunction and retrieving input variable
    vx = self.implementation.add_input_variable("x", self.precision)

    table_size_log = self.table_size_log
    integer_size = 31
    integer_precision = ML_Int32

    max_bound = sup(abs(self.input_intervals[0]))
    max_bound_log = int(ceil(log2(max_bound)))
    Log.report(Log.Info, "max_bound_log=%s " % max_bound_log)
    scaling_power = integer_size - max_bound_log
    Log.report(Log.Info, "scaling power: %s " % scaling_power)

    storage_precision = ML_Custom_FixedPoint_Format(1, 30, signed=True)

    Log.report(Log.Info, "tabulating cosine and sine")
    # cosine and sine fused table
    fused_table = ML_NewTable(
        dimensions=[2**table_size_log, 2],
        storage_precision=storage_precision,
        tag="fast_lib_shared_table")  # self.uniquify_name("cossin_table"))
    # filling table
    for i in range(2**table_size_log):
        local_x = i / S2**table_size_log * S2**max_bound_log

        cos_local = cos(local_x)  # nearestint(cos(local_x) * S2**storage_precision.get_frac_size())
        sin_local = sin(local_x)  # nearestint(sin(local_x) * S2**storage_precision.get_frac_size())

        fused_table[i][0] = cos_local
        fused_table[i][1] = sin_local

    # argument reduction evaluation scheme
    # scaling_factor = Constant(S2**scaling_power, precision=self.precision)

    red_vx_precision = ML_Custom_FixedPoint_Format(31 - scaling_power,
                                                   scaling_power, signed=True)
    Log.report(
        Log.Verbose,
        "red_vx_precision.get_c_bit_size()=%d" % red_vx_precision.get_c_bit_size())
    # red_vx = NearestInteger(vx * scaling_factor, precision=integer_precision)
    red_vx = Conversion(vx, precision=red_vx_precision, tag="red_vx",
                        debug=debug_fixed32)

    computation_precision = red_vx_precision  # self.precision
    output_precision = self.get_output_precision()
    Log.report(Log.Info, "computation_precision is %s" % computation_precision)
    Log.report(Log.Info, "storage_precision is %s" % storage_precision)
    Log.report(Log.Info, "output_precision is %s" % output_precision)

    hi_mask_value = 2**32 - 2**(32 - table_size_log - 1)
    hi_mask = Constant(hi_mask_value, precision=ML_Int32)
    Log.report(Log.Info, "hi_mask=0x%x" % hi_mask_value)

    red_vx_hi_int = BitLogicAnd(TypeCast(red_vx, precision=ML_Int32),
                                hi_mask, precision=ML_Int32,
                                tag="red_vx_hi_int", debug=debugd)
    red_vx_hi = TypeCast(red_vx_hi_int, precision=red_vx_precision,
                         tag="red_vx_hi", debug=debug_fixed32)
    red_vx_lo = red_vx - red_vx_hi
    red_vx_lo.set_attributes(precision=red_vx_precision, tag="red_vx_lo",
                             debug=debug_fixed32)
    table_index = BitLogicRightShift(
        TypeCast(red_vx, precision=ML_Int32),
        scaling_power - (table_size_log - max_bound_log),
        precision=ML_Int32, tag="table_index", debug=debugd)

    tabulated_cos = TableLoad(fused_table, table_index, 0, tag="tab_cos",
                              precision=storage_precision, debug=debug_fixed32)
    tabulated_sin = TableLoad(fused_table, table_index, 1, tag="tab_sin",
                              precision=storage_precision, debug=debug_fixed32)

    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    Log.report(Log.Info, "building polynomial approximation for cosine")
    # cosine polynomial approximation
    poly_interval = Interval(0, S2**(max_bound_log - table_size_log))
    Log.report(Log.Info, "poly_interval=%s " % poly_interval)
    cos_poly_degree = 2  # int(sup(guessdegree(cos(x), poly_interval, accuracy_goal)))

    Log.report(Log.Verbose, "cosine polynomial approximation")
    cos_poly_object, cos_approx_error = Polynomial.build_from_approximation_with_error(
        cos(sollya.x), [0, 2],
        [0] + [computation_precision.get_bit_size()],
        poly_interval, sollya.absolute,
        error_function=error_function)
    # cos_eval_scheme = PolynomialSchemeEvaluator.generate_horner_scheme(cos_poly_object, red_vx_lo, unified_precision=computation_precision)
    Log.report(Log.Info, "cos_approx_error=%e" % cos_approx_error)
    cos_coeff_list = cos_poly_object.get_ordered_coeff_list()
    coeff_C0 = cos_coeff_list[0][1]
    coeff_C2 = Constant(cos_coeff_list[1][1],
                        precision=ML_Custom_FixedPoint_Format(-1, 32, signed=True))

    Log.report(Log.Info, "building polynomial approximation for sine")
    # sine polynomial approximation
    sin_poly_degree = 2  # int(sup(guessdegree(sin(x)/x, poly_interval, accuracy_goal)))
    Log.report(Log.Info, "sine poly degree: %e" % sin_poly_degree)
    Log.report(Log.Verbose, "sine polynomial approximation")
    sin_poly_object, sin_approx_error = Polynomial.build_from_approximation_with_error(
        sin(sollya.x) / sollya.x, [0, 2],
        [0] + [computation_precision.get_bit_size()] * (sin_poly_degree + 1),
        poly_interval, sollya.absolute,
        error_function=error_function)
    sin_coeff_list = sin_poly_object.get_ordered_coeff_list()
    coeff_S0 = sin_coeff_list[0][1]
    coeff_S2 = Constant(sin_coeff_list[1][1],
                        precision=ML_Custom_FixedPoint_Format(-1, 32, signed=True))

    # scheme selection between sine and cosine
    if self.cos_output:
        scheme = self.generate_cos_scheme(computation_precision, tabulated_cos,
                                          tabulated_sin, coeff_S2, coeff_C2,
                                          red_vx_lo)
    else:
        scheme = self.generate_sin_scheme(computation_precision, tabulated_cos,
                                          tabulated_sin, coeff_S2, coeff_C2,
                                          red_vx_lo)

    result = Conversion(scheme, precision=self.get_output_precision())

    Log.report(
        Log.Verbose,
        "result operation tree :\n %s " % result.get_str(
            display_precision=True, depth=None, memoization_map={}))
    scheme = Statement(Return(result))
    return scheme
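# Standalone sketch of the table-driven reduction above: split x into a "hi"
# part that indexes a precomputed (cos, sin) table and a small "lo" residue,
# then recombine with the angle-addition formula
#   cos(hi + lo) = cos(hi)*cos(lo) - sin(hi)*sin(lo).
# Plain-Python illustration with math.cos/math.sin filling the table and
# low-degree polynomials for the residue; table size and tolerance are
# assumptions of this demo, not values from the scheme.
import math

TABLE_SIZE_LOG = 8
TABLE = [(math.cos(i / 2 ** TABLE_SIZE_LOG), math.sin(i / 2 ** TABLE_SIZE_LOG))
         for i in range(2 ** TABLE_SIZE_LOG)]

def cos_table_poly(x):
    # assumes 0 <= x < 1 for this demo
    idx = int(x * 2 ** TABLE_SIZE_LOG)
    hi = idx / 2 ** TABLE_SIZE_LOG
    lo = x - hi
    cos_hi, sin_hi = TABLE[idx]
    cos_lo = 1.0 - lo * lo / 2.0   # degree-2 approximation of cos on [0, 2^-8)
    sin_lo = lo                    # degree-1 approximation of sin
    return cos_hi * cos_lo - sin_hi * sin_lo

assert abs(cos_table_poly(0.7) - math.cos(0.7)) < 1e-8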

def generate_scheme(self):
    # declaring target and instantiating optimization engine
    vx = self.implementation.add_input_variable("x", self.precision)

    Log.set_dump_stdout(True)

    Log.report(Log.Info, "\033[33;1m generating implementation scheme \033[0m")
    if self.debug_flag:
        Log.report(Log.Info, "\033[31;1m debug has been enabled \033[0;m")

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        if self.libm_compliant:
            return RaiseReturn(*args, precision=self.precision, **kwords)
        else:
            return Return(kwords["return_value"], precision=self.precision)

    test_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False,
                           debug=debug_multi, tag="nan_or_inf")
    test_nan = Test(vx, specifier=Test.IsNaN, debug=debug_multi,
                    tag="is_nan_test")
    test_positive = Comparison(vx, 0, specifier=Comparison.GreaterOrEqual,
                               debug=debug_multi, tag="inf_sign")

    test_signaling_nan = Test(vx, specifier=Test.IsSignalingNaN,
                              debug=debug_multi, tag="is_signaling_nan")
    return_snan = Statement(
        ExpRaiseReturn(ML_FPE_Invalid, return_value=FP_QNaN(self.precision)))

    # return in case of infinity input
    infty_return = Statement(
        ConditionBlock(
            test_positive,
            Return(FP_PlusInfty(self.precision), precision=self.precision),
            Return(FP_PlusZero(self.precision), precision=self.precision)))
    # return in case of specific value input (NaN or inf)
    specific_return = ConditionBlock(
        test_nan,
        ConditionBlock(
            test_signaling_nan, return_snan,
            Return(FP_QNaN(self.precision), precision=self.precision)),
        infty_return)

    # return in case of standard (non-special) input:
    # exclusion of early overflow and underflow cases
    precision_emax = self.precision.get_emax()
    precision_max_value = S2 * S2**precision_emax
    exp_overflow_bound = sollya.ceil(log(precision_max_value))
    early_overflow_test = Comparison(vx, exp_overflow_bound,
                                     likely=False,
                                     specifier=Comparison.Greater)
    early_overflow_return = Statement(
        ClearException() if self.libm_compliant else Statement(),
        ExpRaiseReturn(ML_FPE_Inexact, ML_FPE_Overflow,
                       return_value=FP_PlusInfty(self.precision)))

    precision_emin = self.precision.get_emin_subnormal()
    precision_min_value = S2**precision_emin
    exp_underflow_bound = floor(log(precision_min_value))

    early_underflow_test = Comparison(vx, exp_underflow_bound,
                                      likely=False,
                                      specifier=Comparison.Less)
    early_underflow_return = Statement(
        ClearException() if self.libm_compliant else Statement(),
        ExpRaiseReturn(ML_FPE_Inexact, ML_FPE_Underflow,
                       return_value=FP_PlusZero(self.precision)))

    # constant computation
    invlog2 = self.precision.round_sollya_object(1 / log(2), sollya.RN)

    interval_vx = Interval(exp_underflow_bound, exp_overflow_bound)
    interval_fk = interval_vx * invlog2
    interval_k = Interval(floor(inf(interval_fk)),
                          sollya.ceil(sup(interval_fk)))

    log2_hi_precision = self.precision.get_field_size() - (
        sollya.ceil(log2(sup(abs(interval_k)))) + 2)
    Log.report(Log.Info, "log2_hi_precision: %d" % log2_hi_precision)
    invlog2_cst = Constant(invlog2, precision=self.precision)
    log2_hi = round(log(2), log2_hi_precision, sollya.RN)
    log2_lo = self.precision.round_sollya_object(log(2) - log2_hi, sollya.RN)

    # argument reduction
    unround_k = vx * invlog2
    unround_k.set_attributes(tag="unround_k", debug=debug_multi)
    k = NearestInteger(unround_k, precision=self.precision, debug=debug_multi)
    ik = NearestInteger(unround_k,
                        precision=self.precision.get_integer_format(),
                        debug=debug_multi, tag="ik")
    ik.set_tag("ik")
    k.set_tag("k")
    exact_pre_mul = (k * log2_hi)
    exact_pre_mul.set_attributes(exact=True)
    exact_hi_part = vx - exact_pre_mul
    exact_hi_part.set_attributes(exact=True, tag="exact_hi",
                                 debug=debug_multi,
                                 prevent_optimization=True)
    exact_lo_part = -k * log2_lo
    exact_lo_part.set_attributes(tag="exact_lo", debug=debug_multi,
                                 prevent_optimization=True)
    r = exact_hi_part + exact_lo_part
    r.set_tag("r")
    r.set_attributes(debug=debug_multi)

    approx_interval = Interval(-log(2) / 2, log(2) / 2)

    approx_interval_half = approx_interval / 2
    approx_interval_split = [
        Interval(-log(2) / 2, inf(approx_interval_half)),
        approx_interval_half,
        Interval(sup(approx_interval_half), log(2) / 2)
    ]

    # TODO: should be computed automatically
    exact_hi_interval = approx_interval
    exact_lo_interval = -interval_k * log2_lo

    opt_r = self.optimise_scheme(r, copy={})

    tag_map = {}
    self.opt_engine.register_nodes_by_tag(opt_r, tag_map)

    cg_eval_error_copy_map = {
        vx: Variable("x", precision=self.precision, interval=interval_vx),
        tag_map["k"]: Variable("k", interval=interval_k,
                               precision=self.precision)
    }

    if is_gappa_installed():
        eval_error = self.gappa_engine.get_eval_error_v2(
            self.opt_engine, opt_r, cg_eval_error_copy_map,
            gappa_filename="red_arg.g")
    else:
        eval_error = 0.0
        Log.report(Log.Warning, "gappa is not installed in this environment")
    Log.report(Log.Info, "eval error: %s" % eval_error)

    local_ulp = sup(ulp(sollya.exp(approx_interval), self.precision))

    # FIXME: refactor error_goal from accuracy
    Log.report(Log.Info, "accuracy: %s" % self.accuracy)
    if isinstance(self.accuracy, ML_Faithful):
        error_goal = local_ulp
    elif isinstance(self.accuracy, ML_CorrectlyRounded):
        error_goal = S2**-1 * local_ulp
    elif isinstance(self.accuracy, ML_DegradedAccuracyAbsolute):
        error_goal = self.accuracy.goal
    elif isinstance(self.accuracy, ML_DegradedAccuracyRelative):
        error_goal = self.accuracy.goal
    else:
        Log.report(Log.Error, "unknown accuracy: %s" % self.accuracy)

    # error_goal = local_ulp  # S2**-(self.precision.get_field_size()+1)
    error_goal_approx = S2**-1 * error_goal

    Log.report(Log.Info,
               "\033[33;1m building mathematical polynomial \033[0m\n")
    poly_degree = max(
        sup(guessdegree(expm1(sollya.x) / sollya.x, approx_interval,
                        error_goal_approx)) - 1, 2)
    init_poly_degree = poly_degree

    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_estrin_scheme
    # polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme

    while True:
        Log.report(Log.Info, "attempting poly degree: %d" % poly_degree)
        precision_list = [1] + [self.precision] * poly_degree
        poly_object, poly_approx_error = Polynomial.build_from_approximation_with_error(
            expm1(sollya.x), poly_degree, precision_list, approx_interval,
            sollya.absolute, error_function=error_function)
        Log.report(Log.Info, "polynomial: %s " % poly_object)
        sub_poly = poly_object.sub_poly(start_index=2)
        Log.report(Log.Info, "polynomial: %s " % sub_poly)

        Log.report(Log.Info, "poly approx error: %s" % poly_approx_error)

        Log.report(
            Log.Info,
            "\033[33;1m generating polynomial evaluation scheme \033[0m")
        pre_poly = polynomial_scheme_builder(
            poly_object, r, unified_precision=self.precision)
        pre_poly.set_attributes(tag="pre_poly", debug=debug_multi)

        pre_sub_poly = polynomial_scheme_builder(
            sub_poly, r, unified_precision=self.precision)
        pre_sub_poly.set_attributes(tag="pre_sub_poly", debug=debug_multi)

        poly = 1 + (exact_hi_part + (exact_lo_part + pre_sub_poly))
        poly.set_tag("poly")

        # optimizing poly before evaluation error computation
        # opt_poly = self.opt_engine.optimization_process(poly, self.precision, fuse_fma=fuse_fma)
        # opt_sub_poly = self.opt_engine.optimization_process(pre_sub_poly, self.precision, fuse_fma=fuse_fma)
        opt_poly = self.optimise_scheme(poly)
        opt_sub_poly = self.optimise_scheme(pre_sub_poly)

        # evaluating error of the polynomial approximation
        r_gappa_var = Variable("r", precision=self.precision,
                               interval=approx_interval)
        exact_hi_gappa_var = Variable("exact_hi", precision=self.precision,
                                      interval=exact_hi_interval)
        exact_lo_gappa_var = Variable("exact_lo", precision=self.precision,
                                      interval=exact_lo_interval)
        vx_gappa_var = Variable("x", precision=self.precision,
                                interval=interval_vx)
        k_gappa_var = Variable("k", interval=interval_k,
                               precision=self.precision)

        # print("exact_hi interval: ", exact_hi_interval)

        sub_poly_error_copy_map = {
            # r.get_handle().get_node(): r_gappa_var,
            # vx.get_handle().get_node(): vx_gappa_var,
            exact_hi_part.get_handle().get_node(): exact_hi_gappa_var,
            exact_lo_part.get_handle().get_node(): exact_lo_gappa_var,
            # k.get_handle().get_node(): k_gappa_var,
        }

        poly_error_copy_map = {
            exact_hi_part.get_handle().get_node(): exact_hi_gappa_var,
            exact_lo_part.get_handle().get_node(): exact_lo_gappa_var,
        }

        if is_gappa_installed():
            sub_poly_eval_error = -1.0
            sub_poly_eval_error = self.gappa_engine.get_eval_error_v2(
                self.opt_engine, opt_sub_poly, sub_poly_error_copy_map,
                gappa_filename="%s_gappa_sub_poly.g" % self.function_name)

            dichotomy_map = [
                {exact_hi_part.get_handle().get_node():
                     approx_interval_split[0]},
                {exact_hi_part.get_handle().get_node():
                     approx_interval_split[1]},
                {exact_hi_part.get_handle().get_node():
                     approx_interval_split[2]},
            ]
            poly_eval_error_dico = self.gappa_engine.get_eval_error_v3(
                self.opt_engine, opt_poly, poly_error_copy_map,
                gappa_filename="gappa_poly.g", dichotomy=dichotomy_map)

            poly_eval_error = max(
                [sup(abs(err)) for err in poly_eval_error_dico])
        else:
            poly_eval_error = 0.0
            sub_poly_eval_error = 0.0
            Log.report(Log.Warning,
                       "gappa is not installed in this environment")
            Log.report(Log.Info, "stopping autonomous degree research")
            # incrementing polynomial degree to counteract the initial
            # decrement effect
            poly_degree += 1
            break

        Log.report(Log.Info, "poly evaluation error: %s" % poly_eval_error)
        Log.report(Log.Info,
                   "sub poly evaluation error: %s" % sub_poly_eval_error)

        global_poly_error = None
        global_rel_poly_error = None

        for case_index in range(3):
            poly_error = poly_approx_error + poly_eval_error_dico[case_index]
            rel_poly_error = sup(
                abs(poly_error /
                    sollya.exp(approx_interval_split[case_index])))
            if global_rel_poly_error is None or \
                    rel_poly_error > global_rel_poly_error:
                global_rel_poly_error = rel_poly_error
                global_poly_error = poly_error
        flag = error_goal > global_rel_poly_error

        if flag:
            break
        else:
            poly_degree += 1

    late_overflow_test = Comparison(ik, self.precision.get_emax(),
                                    specifier=Comparison.Greater,
                                    likely=False, debug=debug_multi,
                                    tag="late_overflow_test")
    overflow_exp_offset = (self.precision.get_emax() -
                           self.precision.get_field_size() // 2)
    diff_k = Subtraction(
        ik,
        Constant(overflow_exp_offset,
                 precision=self.precision.get_integer_format()),
        precision=self.precision.get_integer_format(),
        debug=debug_multi,
        tag="diff_k",
    )
    late_overflow_result = (
        ExponentInsertion(diff_k, precision=self.precision) * poly
    ) * ExponentInsertion(overflow_exp_offset, precision=self.precision)
    late_overflow_result.set_attributes(silent=False,
                                        tag="late_overflow_result",
                                        debug=debug_multi,
                                        precision=self.precision)
    late_overflow_return = ConditionBlock(
        Test(late_overflow_result, specifier=Test.IsInfty, likely=False),
        ExpRaiseReturn(ML_FPE_Overflow,
                       return_value=FP_PlusInfty(self.precision)),
        Return(late_overflow_result, precision=self.precision))

    late_underflow_test = Comparison(k, self.precision.get_emin_normal(),
                                     specifier=Comparison.LessOrEqual,
                                     likely=False)
    underflow_exp_offset = 2 * self.precision.get_field_size()
    corrected_exp = Addition(
        ik,
        Constant(underflow_exp_offset,
                 precision=self.precision.get_integer_format()),
        precision=self.precision.get_integer_format(),
        tag="corrected_exp")
    late_underflow_result = (
        ExponentInsertion(corrected_exp, precision=self.precision) * poly
    ) * ExponentInsertion(-underflow_exp_offset, precision=self.precision)
    late_underflow_result.set_attributes(debug=debug_multi,
                                         tag="late_underflow_result",
                                         silent=False)
    test_subnormal = Test(late_underflow_result,
                          specifier=Test.IsSubnormal)
    late_underflow_return = Statement(
        ConditionBlock(
            test_subnormal,
            ExpRaiseReturn(ML_FPE_Underflow,
                           return_value=late_underflow_result)),
        Return(late_underflow_result, precision=self.precision))

    twok = ExponentInsertion(ik, tag="exp_ik", debug=debug_multi,
                             precision=self.precision)
    # std_result = twok * ((1 + exact_hi_part * pre_poly) + exact_lo_part * pre_poly)
    std_result = twok * poly
    std_result.set_attributes(tag="std_result", debug=debug_multi)
    result_scheme = ConditionBlock(
        late_overflow_test, late_overflow_return,
        ConditionBlock(late_underflow_test, late_underflow_return,
                       Return(std_result, precision=self.precision)))
    std_return = ConditionBlock(
        early_overflow_test, early_overflow_return,
        ConditionBlock(early_underflow_test, early_underflow_return,
                       result_scheme))

    # main scheme
    Log.report(Log.Info, "\033[33;1m MDL scheme \033[0m")
    scheme = ConditionBlock(
        test_nan_or_inf,
        Statement(ClearException() if self.libm_compliant else Statement(),
                  specific_return),
        std_return)

    return scheme
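
# --- illustrative sketch (not part of the generator above) ---
# The reduction above writes x = k * log(2) + r with log(2) split into a
# high part log2_hi, chosen with trailing zero bits so that k * log2_hi is
# exact, and a low correction log2_lo; exp(x) is then rebuilt as 2^k * e^r.
# The constants and the low-degree series below are a double-precision
# stand-in for what the generator computes, accurate only to a few 1e-5.
import math

_DEMO_LOG2_HI = float.fromhex("0x1.62e42feep-1")  # leading bits of log(2)
_DEMO_LOG2_LO = math.log(2.0) - _DEMO_LOG2_HI     # residual correction

def demo_exp_reduction(x):
    """Cody-Waite style reduction and reconstruction for exp(x)."""
    k = int(round(x / math.log(2.0)))
    # k * _DEMO_LOG2_HI is exact (21 trailing zero bits in the constant),
    # so the subtraction cancels without rounding error
    r = (x - k * _DEMO_LOG2_HI) - k * _DEMO_LOG2_LO
    # e^r on [-log(2)/2, log(2)/2]: truncated series standing in for the
    # expm1 polynomial built above
    poly = 1.0 + r * (1.0 + r * (0.5 + r * (1.0 / 6.0 + r / 24.0)))
    return math.ldexp(poly, k)  # 2^k * e^r

def demo_late_overflow(poly, k, offset=512):
    """Mirrors late_overflow_result: multiplying by 2^(k - offset) first and
    by 2^offset afterwards keeps the intermediate product representable when
    2^k alone would overflow (the offset value here is illustrative)."""
    return math.ldexp(poly, k - offset) * math.ldexp(1.0, offset)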

def generate_scalar_scheme(self, vx):
    """ Generate the implementation scheme for the hyperbolic tangent
        meta-function """
    # tanh(x) = sinh(x) / cosh(x)
    #         = (e^x - e^-x) / (e^x + e^-x)
    #         = (e^(2x) - 1) / (e^(2x) + 1)
    # when x -> +inf, tanh(x) -> 1
    # when x -> -inf, tanh(x) -> -1
    # near 0: e^x    ~ 1 + x + x^2 / 2 + x^3 / 6 + ...
    #         e^(-x) ~ 1 - x + x^2 / 2 - x^3 / 6 + ...
    # when x -> 0, tanh(x) ~ (2 (x + x^3/6 + ...)) / (2 + x^2 + ...) ~ x
    # We can divide the input interval into 3 parts:
    # positive, around 0, and negative.
    # Possible argument reduction:
    #   x = m.2^E = k * log(2) + r
    #   (k != 0) => tanh(x) = (2^(2k) * e^(2r) - 1) / (2^(2k) * e^(2r) + 1)
    #                       = (1 - e^(-2r) / 2^(2k)) / (1 + e^(-2r) / 2^(2k))
    #
    # tanh(x) = (e^(2x) - 1) / (e^(2x) + 1)
    #         = (e^(2x) + 1 - 1 - 1) / (e^(2x) + 1)
    #         = 1 - 2 / (e^(2x) + 1)

    # tanh is odd, so we reduce the computation to the absolute value of vx
    abs_vx = Abs(vx, precision=self.precision)

    # if p is the expected output precision:
    # x > (p+2) * log(2) / 2 => tanh(x) = 1 - eps, where eps < 1/2 * 2^-p
    p = self.precision.get_mantissa_size()
    high_bound = (p + 2) * sollya.log(2) / 2
    near_zero_bound = 0.125
    interval_num = 1024

    Log.report(Log.Verbose,
               "high_bound={}, near_zero_bound={}, interval_num={}",
               float(high_bound), near_zero_bound, interval_num)
    interval_size = (high_bound - near_zero_bound) / 1024
    new_interval_size = S2**int(sollya.log2(interval_size))
    interval_num *= 2
    high_bound = new_interval_size * interval_num + near_zero_bound
    Log.report(Log.Verbose,
               "high_bound={}, near_zero_bound={}, interval_num={}",
               float(high_bound), near_zero_bound, interval_num)

    ERROR_THRESHOLD = S2**-p
    Log.report(Log.Info, "ERROR_THRESHOLD={}", ERROR_THRESHOLD)

    # near-zero approximation
    near_zero_scheme, near_zero_error = self.generate_approx_poly_near_zero(
        sollya.tanh(sollya.x), near_zero_bound, S2**-p, abs_vx)

    # approximation parameters
    poly_degree = 7
    approx_interval = Interval(near_zero_bound, high_bound)

    sollya.settings.points = 117

    approx_scheme, approx_error = piecewise_approximation(
        sollya.tanh, abs_vx, self.precision,
        bound_low=near_zero_bound,
        bound_high=high_bound,
        num_intervals=interval_num,
        max_degree=poly_degree,
        error_threshold=ERROR_THRESHOLD)
    Log.report(Log.Warning, "approx_error={}".format(approx_error))

    comp_near_zero_bound = abs_vx < near_zero_bound
    comp_near_zero_bound.set_attributes(tag="comp_near_zero_bound",
                                        debug=debug_multi)
    comp_high_bound = abs_vx < high_bound
    comp_high_bound.set_attributes(tag="comp_high_bound", debug=debug_multi)

    complete_scheme = Select(
        comp_near_zero_bound, near_zero_scheme,
        Select(comp_high_bound, approx_scheme,
               Constant(1.0, precision=self.precision)))

    scheme = Return(Select(vx < 0, Negation(complete_scheme),
                           complete_scheme),
                    precision=self.precision)
    return scheme
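
# --- illustrative sketch (not part of the generator above) ---
# The scalar tanh scheme selects between three regions of |x| and restores
# the sign at the end, since tanh is odd. math.tanh stands in for the
# piecewise polynomial approximation, and the bounds mirror the ones
# computed above for double precision (p = 53).
import math

_DEMO_NEAR_ZERO_BOUND = 0.125
_DEMO_HIGH_BOUND = (53 + 2) * math.log(2.0) / 2.0  # tanh rounds to 1 beyond

def demo_tanh(x):
    ax = abs(x)
    if ax < _DEMO_NEAR_ZERO_BOUND:
        # near 0: leading terms of the odd series, tanh(x) ~ x - x^3/3
        result = ax - ax**3 / 3.0
    elif ax < _DEMO_HIGH_BOUND:
        result = math.tanh(ax)  # stand-in for piecewise_approximation
    else:
        result = 1.0            # saturation: tanh(x) = 1 - eps, eps < 2^-p
    return -result if x < 0 else result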