def Add211(x, y):
    """ Multi-precision Addition (2sum): HI, LO = x + y

        Builds the nodes hi = x + y, t = hi - x and lo = y - t.

        TODO: missing assumption on input order

        @return tuple (hi, lo) of operation nodes """
    high = Addition(x, y)
    # amount of y that ended up in the rounded sum
    # (presumably relies on |x| >= |y|, see TODO above)
    absorbed = Subtraction(high, x)
    low = Subtraction(y, absorbed)
    return high, low
def Split(a, precision=None):
    """ Splitting algorithm for Dekker TwoMul.

        The splitting constant is selected from the format of @p a
        (4097 for ML_Binary32, 134217729 for ML_Binary64); any other
        format raises KeyError.

        @param a node to be split
        @param precision format attached to every generated operation
        @return tuple (ah, al) of operation nodes """
    split_constants = {ML_Binary32: 4097, ML_Binary64: 134217729}
    splitter = Constant(
        split_constants[a.precision],
        precision=a.get_precision(),
        tag='fp_split',
    )
    scaled = Multiplication(splitter, a, precision=precision)
    remainder = Subtraction(a, scaled, precision=precision)
    high = Addition(remainder, scaled, precision=precision)
    low = Subtraction(a, high, precision=precision)
    return high, low
def Add212(xh, yh, yl):
    """ Multi-precision Addition: HI, LO = xh + [yh:yl]

        @return tuple (zh, zl) of operation nodes """
    head_sum = Addition(xh, yh)
    # ((xh - head_sum) + yh) + yl
    tail = Addition(Addition(Subtraction(xh, head_sum), yh), yl)
    zh = Addition(head_sum, tail)
    zl = Addition(Subtraction(head_sum, zh), tail)
    return zh, zl
def generate_fasttwosum(vx, vy):
    """Return two optrees for a FastTwoSum operation.

    Precondition: |vx| >= |vy|.

    The return value is a tuple (sum, error) where sum = vx + vy and
    error = vy - (sum - vx).
    """
    s = Addition(vx, vy)
    # the subtraction must use the rounded sum `s`; the previous code
    # referenced an undefined name `z` and raised NameError
    b = Subtraction(s, vx)
    e = Subtraction(vy, b)
    return s, e
def subnormalize_multi(x_list, factor, precision=None, fma=True):
    """ x_list is a multi-component number with components ordered from the
        most to the least significant.
        x_list[0] must be the rounded evaluation of
        (x_list[0] + x_list[1] + ...)

        @param x_list list of 2 or 3 component nodes
        @param factor node added to the exponent of x_list[0]
        @param precision floating-point format of the components (required,
               its integer format is used for exponent arithmetic)
        @param fma unused in this function
        @return the field of x as a floating-point number assuming
                the exponent of the result is exponent(x) + factor
                and managing field subnormalization if required.
                The result is a list with the same length as x_list whose
                trailing components are zero constants.
        @raise NotImplementedError if len(x_list) is neither 2 nor 3 """
    x_hi = x_list[0]
    int_precision = precision.get_integer_format()
    ex = ExponentExtraction(x_hi, precision=int_precision)
    scaled_ex = Addition(ex, factor, precision=int_precision)
    CI0 = Constant(0, precision=int_precision)
    field_size = Constant(precision.get_field_size(), precision=int_precision)

    # difference between x's real exponent and the minimal exponent
    # for a floating-point value of format precision, clamped to
    # [0, field_size].
    # The previous nesting, Max(Min(diff, 0), field_size), always evaluated
    # to field_size because Min(diff, 0) can never exceed 0.
    delta = Max(
        Min(
            Subtraction(
                Constant(precision.get_emin_normal(), precision=int_precision),
                scaled_ex,
                precision=int_precision
            ),
            field_size,
            precision=int_precision
        ),
        CI0,
        precision=int_precision
    )

    round_factor_exp = Addition(delta, ex, precision=int_precision)
    round_factor = ExponentInsertion(round_factor_exp, precision=precision)

    # to force a rounding as if x_hi was of precision p - delta
    # we use round_factor as follows:
    # o(o(round_factor + x_hi) - round_factor)
    if len(x_list) == 2:
        rounded_x_hi = Subtraction(
            Add112(round_factor, x_list[0], x_list[1], precision=precision)[0],
            round_factor,
            precision=precision
        )
    elif len(x_list) == 3:
        rounded_x_hi = Subtraction(
            Add113(round_factor, x_list[0], x_list[1], x_list[2],
                   precision=precision)[0],
            round_factor,
            precision=precision
        )
    else:
        Log.report(Log.Error, "len of x_list: {} is not supported in subnormalize_multi", len(x_list))
        raise NotImplementedError

    return [rounded_x_hi] + [
        Constant(0, precision=precision) for _ in range(len(x_list) - 1)
    ]
def Split(a):
    """ Splitting algorithm for Dekker TwoMul.

        The splitting constant depends on the format of @p a:
        4097 for ML_Binary32 and 134217729 for ML_Binary64. Any other
        format keeps the historical value 4097 so that existing callers
        are unaffected.

        @param a node to be split
        @return tuple (ah, al) of operation nodes """
    a_format = a.get_precision()
    # the constant used to be hard-coded to 4097 whatever the input format,
    # which is the binary32 value only
    cst_value = {ML_Binary32: 4097, ML_Binary64: 134217729}.get(a_format, 4097)
    s = Constant(cst_value, precision=a_format, tag='fp_split')
    c = Multiplication(s, a)
    tmp = Subtraction(a, c)
    ah = Addition(tmp, c)
    al = Subtraction(a, ah)
    return ah, al
def Add222(xh, xl, yh, yl):
    """ Multi-precision Addition: HI, LO = [xh:xl] + [yh:yl]

        @return tuple (zh, zl) of operation nodes """
    head_sum = Addition(xh, yh)
    # (((xh - head_sum) + yh) + yl) + xl
    tail = Addition(
        Addition(Addition(Subtraction(xh, head_sum), yh), yl),
        xl)
    zh = Addition(head_sum, tail)
    zl = Addition(Subtraction(head_sum, zh), tail)
    return zh, zl
def generate_twosum(vx, vy):
    """Return two optrees for a TwoSum operation.

    The return value is a tuple (sum, error).
    """
    total = Addition(vx, vy)
    vx_seen = Subtraction(total, vy)
    vy_seen = Subtraction(total, vx_seen)
    vx_residual = Subtraction(vx, vx_seen)
    vy_residual = Subtraction(vy, vy_seen)
    return total, Addition(vx_residual, vy_residual)
def generate_twosum(vx, vy, precision=None):
    """Return two optrees for a TwoSum operation.

    Every generated node is given the format `precision`.
    The return value is a tuple (sum, error).
    """
    def add(lhs, rhs):
        return Addition(lhs, rhs, precision=precision)

    def sub(lhs, rhs):
        return Subtraction(lhs, rhs, precision=precision)

    total = add(vx, vy)
    vx_seen = sub(total, vy)
    vy_seen = sub(total, vx_seen)
    vx_residual = sub(vx, vx_seen)
    vy_residual = sub(vy, vy_seen)
    return total, add(vx_residual, vy_residual)
def Add222(xh, xl, yh, yl, precision=None):
    """ Multi-precision Addition: HI, LO = [xh:xl] + [yh:yl]

        Every generated node is given the format @p precision.

        @return tuple (zh, zl) of operation nodes """
    def add(lhs, rhs):
        return Addition(lhs, rhs, precision=precision)

    def sub(lhs, rhs):
        return Subtraction(lhs, rhs, precision=precision)

    head_sum = add(xh, yh)
    # (((xh - head_sum) + yh) + yl) + xl
    tail = add(add(add(sub(xh, head_sum), yh), yl), xl)
    zh = add(head_sum, tail)
    zl = add(sub(head_sum, zh), tail)
    return zh, zl
def generate_count_leading_zeros(vx):
    """Generate a vectorizable LZCNT optree.

    The scheme is a branch-free binary search: each stage tests whether the
    upper part of the remaining 16-bit window is zero and, if so, adds the
    stage width to the count and shifts the value left by that width.
    """
    # If the left half of x is 0, set count = 16. If the left half is
    # nonzero, set count = 0 and shift x right 16.
    probe = -BitLogicRightShift(vx, 16)
    step = BitLogicAnd(BitArithmeticRightShift(probe, 16), 16)
    count = Subtraction(16, step)
    current = BitLogicRightShift(vx, step)
    # Now x is of the form 0000xxxx.

    # (threshold, width): if positions 8-15 / 12-15 / 14-15 are 0,
    # add 8 / 4 / 2 to the count and shift x left by the same amount.
    for threshold, width in ((0x100, 8), (0x1000, 4), (0x4000, 2)):
        probe = current - threshold
        step = BitLogicAnd(BitLogicRightShift(probe, 16), width)
        count = count + step
        current = BitLogicLeftShift(current, step)

    # top2 = 0, 1, 2, or 3 -> correction = 0, 1, 2, or 2 resp.
    top2 = BitLogicRightShift(current, 14)
    correction = BitLogicAnd(
        top2, BitLogicNegate(BitLogicRightShift(top2, 1)))
    return count + 2 - correction
def generate_node_eval_error(optree, input_mapping, node_error_map, node_value_map):
    """ Recursively fill @p node_error_map with an error display statement
        for @p optree and each of its (non-input) operands.

        @param optree node whose evaluation error must be displayed
        @param input_mapping container of input nodes; nodes found in it
               are skipped
        @param node_error_map dict node -> display statement, updated
               in place; also used to skip nodes already processed
        @param node_value_map dict node -> expected value; must contain a
               non-None entry for every processed node
        @return None """
    if optree in node_error_map or optree in input_mapping:
        return
    # placeholder to avoid duplicate computation
    node_error_map[optree] = None
    # recurse on node inputs
    if not is_leaf_node(optree):
        for op in optree.get_inputs():
            generate_node_eval_error(op, input_mapping, node_error_map, node_value_map)
    expected_value = node_value_map[optree]
    # identity test: `!= None` would dispatch to the value's own __ne__
    assert expected_value is not None
    precision = optree.get_precision()
    expected_node = Constant(expected_value, precision=precision)
    # the error is signed (no absolute value is applied)
    # FIXME: wrapping it in Abs may require inserting a signed type if
    # optree/expected_node are unsigned
    error_node = Subtraction(
        optree,
        expected_node,
        precision=precision)
    error_display_statement = get_printf_value(optree, error_node, expected_node)
    node_error_map[optree] = error_display_statement
def Add212(xh, yh, yl, precision=None):
    """ Multi-precision Addition: HI, LO = xh + [yh:yl]

        Every generated node is given the format @p precision.

        Scheme:
            r  = xh + yh
            s  = ((xh - r) + yh) + yl
            zh = r + s
            zl = (r - zh) + s

        @return tuple (zh, zl) of operation nodes """
    def add(lhs, rhs):
        return Addition(lhs, rhs, precision=precision)

    def sub(lhs, rhs):
        return Subtraction(lhs, rhs, precision=precision)

    head_sum = add(xh, yh)
    tail = add(add(sub(xh, head_sum), yh), yl)
    zh = add(head_sum, tail)
    zl = add(sub(head_sum, zh), tail)
    return zh, zl
def Mul222(xh, xl, yh, yl):
    """ Multi-precision Multiplication: HI, LO = [xh:xl] * [yh:yl]

        FMA-based scheme; the xl * yl term is not computed.

        @return tuple (zh, zl) of operation nodes """
    head_product = Multiplication(xh, yh)
    # FMS(xh, yh, ph), then accumulate xh * yl and xl * yh
    low_product = FMA(xl, yh, FMA(xh, yl, FMS(xh, yh, head_product)))
    zh = Addition(head_product, low_product)
    zl = Addition(Subtraction(head_product, zh), low_product)
    return zh, zl
def Mul211(x, y, fma=True):
    """ Multi-precision Multiplication HI, LO = x * y

        @param fma when equal to True the low part is a single FMS node,
               otherwise it is built from Split() partial products
        @return tuple (zh, zl) of operation nodes """
    zh = Multiplication(x, y)
    if fma == True:  # noqa: E712 - exact comparison preserved
        return zh, FMS(x, y, zh)

    # FMA-free path: split both operands and sum the partial products
    x_hi, x_lo = Split(x)
    y_hi, y_lo = Split(y)
    hi_hi = Multiplication(x_hi, y_hi)
    acc = Subtraction(hi_hi, zh)
    hi_lo = Multiplication(x_hi, y_lo)
    lo_hi = Multiplication(x_lo, y_hi)
    lo_lo = Multiplication(x_lo, y_lo)
    acc = Addition(acc, hi_lo)
    acc = Addition(acc, lo_hi)
    zl = Addition(acc, lo_lo)
    return zh, zl
def Mul211(x, y, precision=None, fma=True):
    """ Multi-precision Multiplication HI, LO = x * y

        @param precision format attached to every generated operation
        @param fma when equal to True the low part is a single FMS node,
               otherwise it is built from Split() partial products
        @return tuple (zh, zl) of operation nodes """
    def mul(lhs, rhs):
        return Multiplication(lhs, rhs, precision=precision)

    def add(lhs, rhs):
        return Addition(lhs, rhs, precision=precision)

    zh = mul(x, y)
    if fma == True:  # noqa: E712 - exact comparison preserved
        return zh, FMS(x, y, zh, precision=precision)

    # FMA-free path: split both operands and sum the partial products
    x_hi, x_lo = Split(x, precision=precision)
    y_hi, y_lo = Split(y, precision=precision)
    hi_hi = mul(x_hi, y_hi)
    acc = Subtraction(hi_hi, zh, precision=precision)
    hi_lo = mul(x_hi, y_lo)
    lo_hi = mul(x_lo, y_hi)
    lo_lo = mul(x_lo, y_lo)
    acc = add(acc, hi_lo)
    acc = add(acc, lo_hi)
    zl = add(acc, lo_lo)
    return zh, zl
def Mul222(xh, xl, yh, yl, fma=True):
    """ Multi-precision Multiplication: HI, LO = [xh:xl] * [yh:yl]

        @param fma when equal to True, use the FMA/FMS based scheme;
               otherwise rely on Mul211 (without FMA) and Add211
        @return tuple (zh, zl) of operation nodes """
    if fma == True:
        ph = Multiplication(xh, yh)
        pl = FMS(xh, yh, ph)
        pl = FMA(xh, yl, pl)
        pl = FMA(xl, yh, pl)
        zh = Addition(ph, pl)
        zl = Subtraction(ph, zh)
        zl = Addition(zl, pl)
    else:
        # fma is forwarded by keyword: passed positionally it would be bound
        # to `precision` by the Mul211(x, y, precision=None, fma=True)
        # variant, silently re-enabling the FMA path
        t1, t2 = Mul211(xh, yh, fma=fma)
        t3 = Multiplication(xh, yl)
        t4 = Multiplication(xl, yh)
        t5 = Addition(t3, t4)
        t6 = Addition(t2, t5)
        zh, zl = Add211(t1, t6)
    return zh, zl
def get_index_node(self, vx):
    """ Build the integer node computing the table index associated to vx.

        The bit pattern of vx is reinterpreted as an integer, the encoding
        of 2**self.low_exp_value is subtracted, the result is shifted right
        to keep self.field_bits bits of the mantissa field and masked to
        (self.exp_bits + self.field_bits) bits.

        @param vx input node, whose precision must be self.precision
        @return integer index node """
    assert vx.precision is self.precision
    int_precision = vx.precision.get_integer_format()

    # mask keeping the index_size low-order bits
    index_size = self.exp_bits + self.field_bits
    index_mask = Constant(2**index_size - 1, precision=int_precision)
    # number of mantissa bits discarded
    discarded_bits = vx.get_precision().get_field_size() - self.field_bits
    shift_amount = Constant(discarded_bits, precision=int_precision)
    # integer encoding of the lowest value covered
    low_bound_coding = self.precision.get_integer_coding(
        S2**self.low_exp_value)
    exp_offset = Constant(low_bound_coding, precision=int_precision)

    raw_bits = TypeCast(vx, precision=int_precision)
    rebased = Subtraction(raw_bits, exp_offset, precision=int_precision)
    shifted = BitLogicRightShift(rebased, shift_amount,
                                 precision=int_precision)
    return BitLogicAnd(shifted, index_mask, precision=int_precision)
def generic_poly_split(offset_fct, indexing, target_eps, coeff_precision, vx):
    """ generate the meta approximation for @p offset_fct over several
        intervals defined by @p indexing object
        For each sub-interval, a polynomial approximation with
        maximal_error @p target_eps is tabulated, and evaluated using format
        @p coeff_precision.
        The input variable is @p vx

        @param offset_fct callable mapping a sub-interval offset to the
               function to approximate on that sub-interval
        @param indexing object providing split_num, get_offseted_sub_list(),
               get_offseted_sub_interval() and get_index_node()
        @param target_eps target approximation error used to guess degrees
        @param coeff_precision format of the tabulated coefficients and of
               the evaluation
        @param vx input variable node
        @return node evaluating the tabulated polynomial (Horner scheme)
                at (vx - offset) """
    # computing degree for a different polynomial approximation on each
    # sub-interval
    poly_degree_list = [
        int(sup(guessdegree(offset_fct(offset), sub_interval, target_eps)))
        for offset, sub_interval in indexing.get_offseted_sub_list()
    ]
    poly_max_degree = max(poly_degree_list)

    # tabulating polynomial coefficients on split_num sub-interval of interval
    poly_table = ML_NewTable(
        dimensions=[indexing.split_num, poly_max_degree + 1],
        storage_precision=coeff_precision,
        const=True)
    offset_table = ML_NewTable(
        dimensions=[indexing.split_num],
        storage_precision=coeff_precision,
        const=True)
    max_error = 0.0

    for sub_index in range(indexing.split_num):
        poly_degree = poly_degree_list[sub_index]
        offset, approx_interval = indexing.get_offseted_sub_interval(sub_index)
        offset_table[sub_index] = offset
        if poly_degree == 0:
            # managing constant approximation separately since it seems
            # to break sollya
            local_approx = coeff_precision.round_sollya_object(
                offset_fct(offset)(inf(approx_interval)))
            poly_table[sub_index][0] = local_approx
            for monomial_index in range(1, poly_max_degree + 1):
                poly_table[sub_index][monomial_index] = 0
            approx_error = sollya.infnorm(
                offset_fct(offset) - local_approx, approx_interval)
        else:
            poly_object, approx_error = Polynomial.build_from_approximation_with_error(
                offset_fct(offset), poly_degree,
                [coeff_precision] * (poly_degree + 1),
                approx_interval, sollya.relative)
            # rows are padded with zeros up to poly_max_degree
            for monomial_index in range(poly_max_degree + 1):
                if monomial_index <= poly_degree:
                    poly_table[sub_index][monomial_index] = \
                        poly_object.coeff_map[monomial_index]
                else:
                    poly_table[sub_index][monomial_index] = 0
        max_error = max(approx_error, max_error)

    Log.report(Log.Debug, "max approx error is {}", max_error)

    # indexing function: derive index from input @p vx value
    poly_index = indexing.get_index_node(vx)
    poly_index.set_attributes(tag="poly_index", debug=debug_multi)

    # building polynomial evaluation scheme
    offset = TableLoad(offset_table, poly_index, precision=coeff_precision,
                       tag="offset", debug=debug_multi)
    # Horner accumulator initialised with the leading coefficient
    poly = TableLoad(poly_table, poly_index, poly_max_degree,
                     precision=coeff_precision, tag="poly_init",
                     debug=debug_multi)
    red_vx = Subtraction(vx, offset, precision=vx.precision, tag="red_vx",
                         debug=debug_multi)
    # the leading coefficient is already in `poly`, so the recurrence starts
    # at degree poly_max_degree - 1 (starting at poly_max_degree applied the
    # leading coefficient twice and evaluated a degree + 1 polynomial)
    for monomial_index in range(poly_max_degree - 1, -1, -1):
        coeff = TableLoad(poly_table, poly_index, monomial_index,
                          precision=coeff_precision,
                          tag="poly_%d" % monomial_index, debug=debug_multi)
        # every step is evaluated in coeff_precision
        poly = FMA(red_vx, poly, coeff, precision=coeff_precision)

    return poly
def generate_bench_wrapper(self, test_num=1, loop_num=100000, test_ranges=[Interval(-1.0, 1.0)], debug=False):
    """ Generate the "bench_wrapper" function timing the array function
        self.implementation.

        The generated scheme initializes the timestamp counter, runs the
        array test loop loop_num times, prints the number of elements
        computed, the elapsed time and their ratio, and returns that ratio.

        @param test_num number of randomly sized input arrays
        @param loop_num number of repetitions of the whole test loop
        @param test_ranges list of input intervals, one per input array
               (shared default list: it is only read here, never mutated)
        @param debug unused in this method
        @return FunctionGroup containing the bench wrapper function """
    # interval where the array length is chosen from (randomly)
    index_range = self.test_index_range
    auto_test = CodeFunction("bench_wrapper", output_format=ML_Binary64)
    tested_function = self.implementation.get_function_object()
    function_name = self.implementation.get_name()
    # the failure/success report objects below are built but not referenced
    # in the bench scheme
    failure_report_op = FunctionOperator("report_failure")
    failure_report_function = FunctionObject("report_failure", [], ML_Void, failure_report_op)
    printf_success_op = FunctionOperator(
        "printf",
        arg_map={0: "\"test successful %s\\n\"" % function_name},
        void_function=True)
    printf_success_function = FunctionObject("printf", [], ML_Void, printf_success_op)
    output_precision = FormatAttributeWrapper(self.precision, ["volatile"])
    # NOTE(review): TABLE_SIZE_VALUES below holds
    # len(self.standard_test_cases) + test_num entries while only test_total
    # offsets/rows are generated - confirm standard test cases are meant to
    # be excluded from the count
    test_total = test_num
    # number of arrays expected as inputs for tested_function
    NUM_INPUT_ARRAY = 1
    # position of the input array in tested_function operands (generally
    # equals to 1 as to 0-th input is often the destination array)
    INPUT_INDEX_OFFSET = 1
    # concatenating standard test array at the beginning of randomly
    # generated array
    TABLE_SIZE_VALUES = [
        len(std_table) for std_table in self.standard_test_cases
    ] + [
        random.randrange(index_range[0], index_range[1] + 1)
        for i in range(test_num)
    ]
    # offset of each sub-array = sum of the sizes of the previous ones
    OFFSET_VALUES = [sum(TABLE_SIZE_VALUES[:i]) for i in range(test_total)]
    # each row is a (size, offset) pair
    table_size_offset_array = generate_2d_table(
        test_total, 2, ML_UInt32,
        self.uniquify_name("table_size_array"),
        value_gen=(lambda row_id: (TABLE_SIZE_VALUES[row_id], OFFSET_VALUES[row_id])))
    INPUT_ARRAY_SIZE = sum(TABLE_SIZE_VALUES)
    # TODO/FIXME: implement proper input range depending on input index
    # assuming a single input array
    input_precisions = [self.get_input_precision(1).get_data_precision()]
    # one random value generator per input array
    rng_map = [
        get_precision_rng(precision, inf(test_range), sup(test_range))
        for precision, test_range in zip(input_precisions, test_ranges)
    ]
    # generated table of inputs
    input_tables = [
        generate_1d_table(
            INPUT_ARRAY_SIZE,
            self.get_input_precision(INPUT_INDEX_OFFSET + table_id).get_data_precision(),
            self.uniquify_name("input_table_arg%d" % table_id),
            value_gen=(
                lambda _: input_precisions[table_id].round_sollya_object(
                    rng_map[table_id].get_new_value(), sollya.RN)))
        for table_id in range(NUM_INPUT_ARRAY)
    ]
    # generate output_array
    output_array = generate_1d_table(
        INPUT_ARRAY_SIZE,
        output_precision,
        self.uniquify_name("output_array"),
        #value_gen=(lambda _: FP_QNaN(self.precision))
        value_gen=(lambda _: None),
        const=False,
        empty=True)
    # accumulate element number
    acc_num = Variable("acc_num", precision=ML_Int64, var_type=Variable.Local)

    # no per-array post-processing is required when benchmarking
    def empty_post_statement_gen(input_tables, output_array, table_size_offset_array, array_offset, array_len, test_id):
        return Statement()

    test_loop = self.get_array_test_wrapper(test_total, tested_function, table_size_offset_array, input_tables, output_array, acc_num, empty_post_statement_gen)
    timer = Variable("timer", precision=ML_Int64, var_type=Variable.Local)
    # printf(<format>, acc_num, timer, cpe)
    printf_timing_op = FunctionOperator(
        "printf",
        arg_map={
            0: "\"%s %%\"PRIi64\" elts computed in %%\"PRIi64\" nanoseconds => %%.3f CPE \\n\"" % function_name,
            1: FO_Arg(0),
            2: FO_Arg(1),
            3: FO_Arg(2)
        },
        void_function=True)
    printf_timing_function = FunctionObject(
        "printf", [ML_Int64, ML_Int64, ML_Binary64], ML_Void,
        printf_timing_op)
    vj = Variable("j", precision=ML_Int32, var_type=Variable.Local)
    loop_num_cst = Constant(loop_num, precision=ML_Int32, tag="loop_num")
    loop_increment = 1
    # bench measure of clock per element
    cpe_measure = Division(
        Conversion(timer, precision=ML_Binary64),
        Conversion(acc_num, precision=ML_Binary64),
        precision=ML_Binary64,
        tag="cpe_measure",
    )
    # common test scheme between scalar and vector functions
    test_scheme = Statement(
        self.processor.get_init_timestamp(),
        ReferenceAssign(timer, self.processor.get_current_timestamp()),
        ReferenceAssign(acc_num, 0),
        Loop(
            ReferenceAssign(vj, Constant(0, precision=ML_Int32)),
            vj < loop_num_cst,
            Statement(test_loop,
                      ReferenceAssign(vj, vj + loop_increment))),
        # timer now holds the elapsed time
        ReferenceAssign(
            timer,
            Subtraction(self.processor.get_current_timestamp(), timer, precision=ML_Int64)),
        printf_timing_function(
            Conversion(acc_num, precision=ML_Int64),
            timer,
            cpe_measure,
        ),
        Return(cpe_measure),
        # Return(Constant(0, precision = ML_Int32))
    )
    auto_test.set_scheme(test_scheme)
    return FunctionGroup([auto_test])
def generate_bench(self, processor, test_num=1000, unroll_factor=10):
    """ generate performance bench for self.op_class

        The bench builds a chain of unroll_factor dependent operations
        (the result of each one is the first operand of the next one),
        runs it test_num // unroll_factor times and prints the final value,
        test_num, the elapsed time and the elapsed time per operation.

        @param processor target providing get_current_timestamp()
        @param test_num total number of elementary operations to time
        @param unroll_factor number of chained operations per loop iteration
        @return Statement node implementing the bench """
    initial_inputs = [
        Constant(
            random.uniform(inf(self.init_interval), sup(self.init_interval)),
            precision=precision)
        for precision in self.input_precisions
    ]

    # volatile locals so that the operation chain is not simplified away
    var_inputs = [
        Variable(
            "var_%d" % i,
            precision=FormatAttributeWrapper(precision, ["volatile"]),
            var_type=Variable.Local)
        for i, precision in enumerate(self.input_precisions)
    ]

    # the second %s is replaced by the display format of the output
    # precision, hence four runtime arguments for printf
    printf_timing_op = FunctionOperator(
        "printf",
        arg_map={
            0: ("\"%s[%s] %%lld elts computed "
                "in %%lld cycles =>\\n %%.3f CPE \\n\"") % (
                    self.bench_name,
                    self.output_precision.get_display_format()
                ),
            1: FO_Arg(0),
            2: FO_Arg(1),
            3: FO_Arg(2),
            4: FO_Arg(3)
        },
        void_function=True
    )
    printf_timing_function = FunctionObject(
        "printf",
        [self.output_precision, ML_Int64, ML_Int64, ML_Binary64],
        ML_Void,
        printf_timing_op)
    timer = Variable("timer", precision=ML_Int64, var_type=Variable.Local)

    void_function_op = FunctionOperator("(void)", arity=1, void_function=True)
    void_function = FunctionObject("(void)", [self.output_precision], ML_Void, void_function_op)

    # initialization of operation inputs
    init_assign = metaop.Statement()
    for var_input, init_value in zip(var_inputs, initial_inputs):
        init_assign.push(ReferenceAssign(var_input, init_value))

    # test loop
    loop_i = Variable("i", precision=ML_Int64, var_type=Variable.Local)
    # floor division: the iteration count is an integer constant
    # (true division would produce a float under Python 3)
    test_num_cst = Constant(test_num // unroll_factor, precision=ML_Int64, tag="test_num")

    # Goal build a chain of dependant operation to measure
    # elementary operation latency
    local_inputs = tuple(var_inputs)
    local_result = self.op_class(*local_inputs, precision=self.output_precision, unbreakable=True)
    for _ in range(unroll_factor - 1):
        local_inputs = tuple([local_result] + var_inputs[1:])
        local_result = self.op_class(*local_inputs, precision=self.output_precision, unbreakable=True)
    # renormalisation
    local_result = self.renorm_function(local_result)

    # variable assignation to build dependency chain
    var_assign = Statement()
    var_assign.push(ReferenceAssign(var_inputs[0], local_result))
    final_value = var_inputs[0]

    # loop increment value
    loop_increment = 1

    test_loop = Loop(
        ReferenceAssign(loop_i, Constant(0, precision=ML_Int32)),
        loop_i < test_num_cst,
        Statement(var_assign,
                  ReferenceAssign(loop_i, loop_i + loop_increment)),
    )

    # bench scheme
    test_scheme = Statement(
        ReferenceAssign(timer, processor.get_current_timestamp()),
        init_assign,
        test_loop,
        # timer now holds the elapsed time
        ReferenceAssign(
            timer,
            Subtraction(processor.get_current_timestamp(), timer, precision=ML_Int64)),
        # prevent intermediary variable simplification
        void_function(final_value),
        printf_timing_function(
            final_value,
            Constant(test_num, precision=ML_Int64),
            timer,
            Division(Conversion(timer, precision=ML_Binary64),
                     Constant(test_num, precision=ML_Binary64),
                     precision=ML_Binary64))
        # ,Return(Constant(0, precision = ML_Int32))
    )
    return test_scheme
def generate_cos_scheme(self, computation_precision, tabulated_cos, tabulated_sin, sin_C2, cos_C2, red_vx_lo):
    """ Build the evaluation scheme

            cos + (cos * C2) * u^2 - sin * u - (S2 * u^2) * (sin * u)

        with cos = tabulated_cos, sin = tabulated_sin, C2 = cos_C2,
        S2 = sin_C2 and u = red_vx_lo.

        The products cos * C2 and S2 * u^2 use a signed
        ML_Custom_FixedPoint_Format(-1, 32) format; every other node uses
        @p computation_precision.

        @return node of the final subtraction """
    def product(lhs, rhs, tag):
        # multiplication evaluated in computation_precision
        return Multiplication(lhs, rhs, precision=computation_precision,
                              tag=tag)

    scaled_cos_C2 = Multiplication(
        tabulated_cos, cos_C2,
        precision=ML_Custom_FixedPoint_Format(-1, 32, signed=True),
        tag="cos_C2")
    u_square = product(red_vx_lo, red_vx_lo, "u2")
    sin_times_u = product(tabulated_sin, red_vx_lo, "sin_u")
    cos_term = product(scaled_cos_C2, u_square, "cos_C2_u2")
    sin_coeff = Multiplication(
        sin_C2, u_square,
        precision=ML_Custom_FixedPoint_Format(-1, 32, signed=True),
        tag="S2_u2")
    sin_term = product(sin_coeff, sin_times_u, "S2_u3_sin")

    partial = Addition(tabulated_cos, cos_term,
                       precision=computation_precision,
                       tag="cos_C2_u2_P_cos")
    partial = Subtraction(partial, sin_times_u,
                          precision=computation_precision)
    return Subtraction(partial, sin_term, precision=computation_precision)
def generate_scheme(self):
    """ Build a scheme chaining multi-precision operations.

        Each node is tagged after the multi-precision primitive it is meant
        to exercise (Add211, Mul211, ..., FMA2222, Add122, subnormalize).
        The final double-word result is converted back to self.precision
        and returned.

        @return Statement returning the converted result """
    # declaring function input variable
    v_x = []
    for index in range(self.arity):
        v_x.append(self.implementation.add_input_variable(
            "x%d" % index, self.get_input_precision(index)))

    wide_formats = {
        ML_Binary32: ML_SingleSingle,
        ML_Binary64: ML_DoubleDouble
    }
    double_format = wide_formats[self.precision]

    def wide(op_class, *operands, **attributes):
        # node evaluated in the double-word format
        return op_class(*operands, precision=double_format, **attributes)

    def narrow(op_class, *operands, **attributes):
        # node evaluated in the scalar format
        return op_class(*operands, precision=self.precision, **attributes)

    # testing Add211
    exact_add = wide(Addition, v_x[0], v_x[1], tag="exact_add")
    # testing Mul211
    exact_mul = wide(Multiplication, v_x[0], v_x[1], tag="exact_mul")
    # testing Sub211
    exact_sub = wide(Subtraction, v_x[1], v_x[0], tag="exact_sub")
    # testing Add222
    multi_add = wide(Addition, exact_add, exact_sub, tag="multi_add")
    # testing Mul222
    multi_mul = wide(Multiplication, multi_add, exact_mul, tag="multi_mul")
    # testing Add221 and Add212 and Sub222
    multi_sub = wide(
        Subtraction,
        wide(Addition, exact_sub, v_x[1], tag="add221"),
        wide(Addition, v_x[0], multi_mul, tag="add212"),
        tag="sub222")
    # testing Mul212 and Mul221
    mul212 = wide(Multiplication, multi_sub, v_x[0], tag="mul212")
    mul221 = wide(Multiplication, exact_mul, v_x[1], tag="mul221")
    # testing Sub221 and Sub212
    sub221 = wide(Subtraction, mul212, mul221.hi, tag="sub221")
    sub212 = wide(Subtraction, sub221, mul212.lo, tag="sub212")
    # testing FMA2111
    fma2111 = wide(FMA, sub221.lo, sub212.hi, mul221.hi, tag="fma2111")
    # testing FMA2112
    fma2112 = wide(FMA, fma2111.lo, fma2111.hi, fma2111, tag="fma2112")
    # testing FMA2212
    fma2212 = wide(FMA, fma2112, fma2112.hi, fma2112, tag="fma2212")
    # testing FMA2122
    fma2122 = wide(FMA, fma2212.lo, fma2212, fma2212, tag="fma2122")
    # testing FMA2222
    fma2222 = wide(FMA, fma2122, fma2212, fma2111, tag="fma2222")
    # testing Add122
    add122 = narrow(Addition, fma2222, fma2222, tag="add122")
    # testing Add112
    add112 = narrow(Addition, add122, fma2222, tag="add112")
    # testing Add121
    add121 = narrow(Addition, fma2222, add112, tag="add121")
    # testing subnormalization
    multi_subnormalize = wide(
        SpecificOperation,
        wide(Addition, add121, add112),
        Constant(3, precision=self.precision.get_integer_format()),
        specifier=SpecificOperation.Subnormalize,
        tag="multi_subnormalize")

    result = Conversion(multi_subnormalize, precision=self.precision)
    return Statement(Return(result))
def piecewise_approximation(function, variable, precision, bound_low=-1.0, bound_high=1.0, num_intervals=16, max_degree=2, error_threshold=S2**-24, odd=False, even=False):
    """ Generate a piecewise approximation

        :param function: function to be approximated
        :type function: SollyaObject
        :param variable: input variable
        :type variable: Variable
        :param precision: variable's format
        :type precision: ML_Format
        :param bound_low: lower bound for the approximation interval
        :param bound_high: upper bound for the approximation interval
        :param num_intervals: number of sub-interval / sub-division of the
                              main interval
        :param max_degree: maximum degree for an approximation on any
                           sub-interval (if None, it is derived from the
                           degrees guessed for each sub-interval)
        :param error_threshold: error bound for an approximation on any
                                sub-interval
        :param odd: restrict the monomial degrees used when the lower bound
                    of a sub-interval is a zero of the function
        :param even: same as odd, for even functions

        :return: pair (scheme, error) where scheme is a graph node for an
                 approximation scheme of function evaluated at variable,
                 and error is the maximum approximation error encountered
        :rtype tuple(ML_Operation, SollyaObject):
        :raise SollyaError: re-raised when the approximation of a
                 sub-interval fails and the function is not (nearly)
                 constant on it
    """
    degree_generator = piecewise_approximation_degree_generator(
        function, bound_low, bound_high,
        num_intervals=num_intervals,
        error_threshold=error_threshold,
    )
    degree_list = list(degree_generator)

    # if max_degree is None then we determine it locally
    if max_degree is None:
        max_degree = max(degree_list)

    # table to store coefficients of the approximation on each segment
    coeff_table = ML_NewTable(
        dimensions=[num_intervals, max_degree + 1],
        storage_precision=precision,
        tag="coeff_table",
        const=True  # by default all approximation coeff table are const
    )

    error_function = lambda p, f, ai, mod, t: sollya.dirtyinfnorm(p - f, ai)
    max_approx_error = 0.0
    interval_size = (bound_high - bound_low) / num_intervals

    for i in range(num_intervals):
        subint_low = bound_low + i * interval_size
        subint_high = bound_low + (i + 1) * interval_size

        # function translated so that the sub-interval starts at 0
        local_function = function(sollya.x + subint_low)
        local_interval = Interval(-interval_size, interval_size)

        local_degree = degree_list[i]
        if local_degree > max_degree:
            Log.report(
                Log.Warning,
                "local_degree {} exceeds max_degree bound ({}) in piecewise_approximation",
                local_degree, max_degree)
        # as max_degree defines the size of the table we can use
        # it as the degree for each sub-interval polynomial
        # as there is nothing to gain (yet) by using a smaller polynomial
        degree = max_degree  # min(max_degree, local_degree)

        if function(subint_low) == 0.0:
            # if the lower bound is a zero to the function, we
            # need to force value=0 for the constant coefficient
            # and extend the approximation interval
            local_poly_degree_list = list(
                range(1 if even else 0, degree + 1, 2 if odd or even else 1))
            poly_object, approx_error = Polynomial.build_from_approximation_with_error(
                function(sollya.x) / sollya.x,
                local_poly_degree_list,
                [precision] * len(local_poly_degree_list),
                Interval(-subint_high * 0.95, subint_high),
                sollya.absolute,
                error_function=error_function)
            # multiply by sollya.x
            poly_object = poly_object.sub_poly(offset=-1)
        else:
            try:
                poly_object, approx_error = Polynomial.build_from_approximation_with_error(
                    local_function,
                    degree,
                    [precision] * (degree + 1),
                    local_interval,
                    sollya.absolute,
                    error_function=error_function)
            except SollyaError as err:
                # try to see if function is constant on the interval (possible
                # failure cause for fpminmax)
                cst_value = precision.round_sollya_object(
                    function(subint_low), sollya.RN)
                accuracy = error_threshold
                diff_with_cst_range = sollya.supnorm(
                    cst_value, local_function, local_interval,
                    sollya.absolute, accuracy)
                diff_with_cst = sup(abs(diff_with_cst_range))
                if diff_with_cst < error_threshold:
                    Log.report(Log.Info, "constant polynomial detected")
                    poly_object = Polynomial([function(subint_low)] + [0] * degree)
                    approx_error = diff_with_cst
                else:
                    # Log.Error (capitalized) is the level used throughout
                    # the code base
                    Log.report(
                        Log.Error,
                        "degree: {} for index {}, diff_with_cst={} (vs error_threshold={}) ",
                        degree, i, diff_with_cst, error_threshold,
                        error=err)
                    # never continue with an undefined or stale polynomial
                    # should the report above return
                    raise

        for ci in range(max_degree + 1):
            if ci in poly_object.coeff_map:
                coeff_table[i][ci] = poly_object.coeff_map[ci]
            else:
                coeff_table[i][ci] = 0.0

        if approx_error > error_threshold:
            Log.report(
                Log.Warning,
                "piecewise_approximation on index {} exceeds error threshold: {} > {}",
                i, approx_error, error_threshold)
        max_approx_error = max(max_approx_error, abs(approx_error))

    # computing offset
    diff = Subtraction(variable,
                       Constant(bound_low, precision=precision),
                       tag="diff",
                       debug=debug_multi,
                       precision=precision)
    int_prec = precision.get_integer_format()

    # delta = bound_high - bound_low
    delta_ratio = Constant(num_intervals / (bound_high - bound_low),
                           precision=precision)
    # computing table index
    # index = nearestint(diff / delta * <num_intervals>)
    # clamped to [0, num_intervals - 1]
    index = Max(0,
                Min(
                    NearestInteger(
                        Multiplication(diff, delta_ratio, precision=precision),
                        precision=int_prec,
                    ),
                    num_intervals - 1),
                tag="index",
                debug=debug_multi,
                precision=int_prec)
    # polynomial argument: diff - index * interval_size
    poly_var = Subtraction(diff,
                           Multiplication(
                               Conversion(index, precision=precision),
                               Constant(interval_size, precision=precision)),
                           precision=precision,
                           tag="poly_var",
                           debug=debug_multi)
    # generating indexed polynomial (coefficients from highest to lowest
    # degree)
    coeffs = [(ci, TableLoad(coeff_table, index, ci))
              for ci in range(max_degree + 1)][::-1]
    poly_scheme = PolynomialSchemeEvaluator.generate_horner_scheme2(
        coeffs, poly_var, precision, {}, precision)
    return poly_scheme, max_approx_error
def generate_scheme(self):
    """Build the operation graph implementing the exponential of the input.

    The generated scheme is organised as follows:
      1. special inputs (NaN / infinities) are filtered first;
      2. inputs whose result certainly overflows / underflows are
         returned early;
      3. the argument is reduced as x = k * log(2) + r, with log(2) split
         into a (log2_hi, log2_lo) pair so that k * log2_hi is exact;
      4. expm1(r) is approximated by a polynomial whose degree is increased
         until the (approximation + evaluation) error meets the accuracy
         goal (the evaluation error is only measured when gappa is
         installed);
      5. the result is reconstructed as 2^k * poly, with dedicated paths
         for late overflow and late underflow.

    Returns:
        the root ConditionBlock of the generated scheme.
    """
    # declaring target and instantiating optimization engine
    vx = self.implementation.add_input_variable("x", self.precision)

    Log.set_dump_stdout(True)

    Log.report(Log.Info,
               "\033[33;1m generating implementation scheme \033[0m")
    if self.debug_flag:
        Log.report(Log.Info, "\033[31;1m debug has been enabled \033[0;m")

    # local overloading of RaiseReturn operation
    def ExpRaiseReturn(*args, **kwords):
        kwords["arg_value"] = vx
        kwords["function_name"] = self.function_name
        if self.libm_compliant:
            return RaiseReturn(*args, precision=self.precision, **kwords)
        else:
            return Return(kwords["return_value"], precision=self.precision)

    test_nan_or_inf = Test(vx,
                           specifier=Test.IsInfOrNaN,
                           likely=False,
                           debug=debug_multi,
                           tag="nan_or_inf")
    test_nan = Test(vx,
                    specifier=Test.IsNaN,
                    debug=debug_multi,
                    tag="is_nan_test")
    test_positive = Comparison(vx,
                               0,
                               specifier=Comparison.GreaterOrEqual,
                               debug=debug_multi,
                               tag="inf_sign")

    test_signaling_nan = Test(vx,
                              specifier=Test.IsSignalingNaN,
                              debug=debug_multi,
                              tag="is_signaling_nan")
    return_snan = Statement(
        ExpRaiseReturn(ML_FPE_Invalid,
                       return_value=FP_QNaN(self.precision)))

    # return in case of infinity input: exp(+inf) = +inf, exp(-inf) = +0
    infty_return = Statement(
        ConditionBlock(
            test_positive,
            Return(FP_PlusInfty(self.precision), precision=self.precision),
            Return(FP_PlusZero(self.precision), precision=self.precision)))
    # return in case of specific value input (NaN or inf)
    specific_return = ConditionBlock(
        test_nan,
        ConditionBlock(
            test_signaling_nan, return_snan,
            Return(FP_QNaN(self.precision), precision=self.precision)),
        infty_return)
    # return in case of standard (non-special) input

    # exclusion of early overflow and underflow cases
    precision_emax = self.precision.get_emax()
    precision_max_value = S2 * S2**precision_emax
    exp_overflow_bound = sollya.ceil(log(precision_max_value))
    early_overflow_test = Comparison(vx,
                                     exp_overflow_bound,
                                     likely=False,
                                     specifier=Comparison.Greater)
    early_overflow_return = Statement(
        ClearException() if self.libm_compliant else Statement(),
        ExpRaiseReturn(ML_FPE_Inexact,
                       ML_FPE_Overflow,
                       return_value=FP_PlusInfty(self.precision)))

    precision_emin = self.precision.get_emin_subnormal()
    precision_min_value = S2**precision_emin
    exp_underflow_bound = floor(log(precision_min_value))

    early_underflow_test = Comparison(vx,
                                      exp_underflow_bound,
                                      likely=False,
                                      specifier=Comparison.Less)
    early_underflow_return = Statement(
        ClearException() if self.libm_compliant else Statement(),
        ExpRaiseReturn(ML_FPE_Inexact,
                       ML_FPE_Underflow,
                       return_value=FP_PlusZero(self.precision)))

    # constant computation
    invlog2 = self.precision.round_sollya_object(1 / log(2), sollya.RN)

    interval_vx = Interval(exp_underflow_bound, exp_overflow_bound)
    interval_fk = interval_vx * invlog2
    interval_k = Interval(floor(inf(interval_fk)),
                          sollya.ceil(sup(interval_fk)))

    # log2_hi keeps few enough bits for k * log2_hi to be computed exactly
    log2_hi_precision = self.precision.get_field_size() - (
        sollya.ceil(log2(sup(abs(interval_k)))) + 2)
    Log.report(Log.Info, "log2_hi_precision: %d" % log2_hi_precision)
    # NOTE(review): invlog2_cst is not referenced below; the node
    # construction is kept as-is in case it has side effects.
    invlog2_cst = Constant(invlog2, precision=self.precision)
    log2_hi = round(log(2), log2_hi_precision, sollya.RN)
    log2_lo = self.precision.round_sollya_object(
        log(2) - log2_hi, sollya.RN)

    # argument reduction
    unround_k = vx * invlog2
    unround_k.set_attributes(tag="unround_k", debug=debug_multi)
    # k is the floating-point copy of the reduction index, ik the integer one
    k = NearestInteger(unround_k,
                       precision=self.precision,
                       debug=debug_multi)
    ik = NearestInteger(unround_k,
                        precision=self.precision.get_integer_format(),
                        debug=debug_multi,
                        tag="ik")
    ik.set_tag("ik")
    k.set_tag("k")
    exact_pre_mul = (k * log2_hi)
    exact_pre_mul.set_attributes(exact=True)
    exact_hi_part = vx - exact_pre_mul
    exact_hi_part.set_attributes(exact=True,
                                 tag="exact_hi",
                                 debug=debug_multi,
                                 prevent_optimization=True)
    exact_lo_part = -k * log2_lo
    exact_lo_part.set_attributes(tag="exact_lo",
                                 debug=debug_multi,
                                 prevent_optimization=True)
    r = exact_hi_part + exact_lo_part
    r.set_tag("r")
    r.set_attributes(debug=debug_multi)

    approx_interval = Interval(-log(2) / 2, log(2) / 2)

    # the reduced interval is split in three for the dichotomy-based
    # evaluation error computation
    approx_interval_half = approx_interval / 2
    approx_interval_split = [
        Interval(-log(2) / 2, inf(approx_interval_half)),
        approx_interval_half,
        Interval(sup(approx_interval_half), log(2) / 2)
    ]

    # TODO: should be computed automatically
    exact_hi_interval = approx_interval
    exact_lo_interval = -interval_k * log2_lo

    opt_r = self.optimise_scheme(r, copy={})

    tag_map = {}
    self.opt_engine.register_nodes_by_tag(opt_r, tag_map)

    cg_eval_error_copy_map = {
        vx:
        Variable("x", precision=self.precision, interval=interval_vx),
        tag_map["k"]:
        Variable("k", interval=interval_k, precision=self.precision)
    }

    if is_gappa_installed():
        eval_error = self.gappa_engine.get_eval_error_v2(
            self.opt_engine,
            opt_r,
            cg_eval_error_copy_map,
            gappa_filename="red_arg.g")
    else:
        eval_error = 0.0
        Log.report(Log.Warning,
                   "gappa is not installed in this environnement")
    Log.report(Log.Info, "eval error: %s" % eval_error)

    local_ulp = sup(ulp(sollya.exp(approx_interval), self.precision))
    # FIXME refactor error_goal from accuracy
    Log.report(Log.Info, "accuracy: %s" % self.accuracy)
    if isinstance(self.accuracy, ML_Faithful):
        error_goal = local_ulp
    elif isinstance(self.accuracy, ML_CorrectlyRounded):
        error_goal = S2**-1 * local_ulp
    elif isinstance(self.accuracy, ML_DegradedAccuracyAbsolute):
        error_goal = self.accuracy.goal
    elif isinstance(self.accuracy, ML_DegradedAccuracyRelative):
        error_goal = self.accuracy.goal
    else:
        Log.report(Log.Error, "unknown accuracy: %s" % self.accuracy)
        # Log.Error presumably aborts already; raise explicitly so that
        # error_goal can never be read while unbound.
        raise ValueError("unknown accuracy: %s" % self.accuracy)

    error_goal_approx = S2**-1 * error_goal

    Log.report(Log.Info,
               "\033[33;1m building mathematical polynomial \033[0m\n")
    poly_degree = max(
        sup(
            guessdegree(
                expm1(sollya.x) / sollya.x, approx_interval,
                error_goal_approx)) - 1, 2)
    init_poly_degree = poly_degree

    error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai)

    polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_estrin_scheme

    # the polynomial degree is increased until the error goal is met
    while 1:
        Log.report(Log.Info, "attempting poly degree: %d" % poly_degree)
        precision_list = [1] + [self.precision] * (poly_degree)
        poly_object, poly_approx_error = Polynomial.build_from_approximation_with_error(
            expm1(sollya.x),
            poly_degree,
            precision_list,
            approx_interval,
            sollya.absolute,
            error_function=error_function)
        Log.report(Log.Info, "polynomial: %s " % poly_object)
        # degree 0 and 1 terms are handled separately through exact_hi_part
        # and exact_lo_part (see poly below)
        sub_poly = poly_object.sub_poly(start_index=2)
        Log.report(Log.Info, "polynomial: %s " % sub_poly)

        Log.report(Log.Info, "poly approx error: %s" % poly_approx_error)

        Log.report(
            Log.Info,
            "\033[33;1m generating polynomial evaluation scheme \033[0m")
        pre_poly = polynomial_scheme_builder(
            poly_object, r, unified_precision=self.precision)
        pre_poly.set_attributes(tag="pre_poly", debug=debug_multi)

        pre_sub_poly = polynomial_scheme_builder(
            sub_poly, r, unified_precision=self.precision)
        pre_sub_poly.set_attributes(tag="pre_sub_poly", debug=debug_multi)

        poly = 1 + (exact_hi_part + (exact_lo_part + pre_sub_poly))
        poly.set_tag("poly")

        # optimizing poly before evaluation error computation
        opt_poly = self.optimise_scheme(poly)
        opt_sub_poly = self.optimise_scheme(pre_sub_poly)

        # evaluating error of the polynomial approximation
        r_gappa_var = Variable("r",
                               precision=self.precision,
                               interval=approx_interval)
        exact_hi_gappa_var = Variable("exact_hi",
                                      precision=self.precision,
                                      interval=exact_hi_interval)
        exact_lo_gappa_var = Variable("exact_lo",
                                      precision=self.precision,
                                      interval=exact_lo_interval)
        vx_gappa_var = Variable("x",
                                precision=self.precision,
                                interval=interval_vx)
        k_gappa_var = Variable("k",
                               interval=interval_k,
                               precision=self.precision)

        sub_poly_error_copy_map = {
            exact_hi_part.get_handle().get_node(): exact_hi_gappa_var,
            exact_lo_part.get_handle().get_node(): exact_lo_gappa_var,
        }

        poly_error_copy_map = {
            exact_hi_part.get_handle().get_node(): exact_hi_gappa_var,
            exact_lo_part.get_handle().get_node(): exact_lo_gappa_var,
        }

        if is_gappa_installed():
            sub_poly_eval_error = self.gappa_engine.get_eval_error_v2(
                self.opt_engine,
                opt_sub_poly,
                sub_poly_error_copy_map,
                gappa_filename="%s_gappa_sub_poly.g" % self.function_name)

            dichotomy_map = [
                {
                    exact_hi_part.get_handle().get_node():
                    approx_interval_split[0],
                },
                {
                    exact_hi_part.get_handle().get_node():
                    approx_interval_split[1],
                },
                {
                    exact_hi_part.get_handle().get_node():
                    approx_interval_split[2],
                },
            ]
            poly_eval_error_dico = self.gappa_engine.get_eval_error_v3(
                self.opt_engine,
                opt_poly,
                poly_error_copy_map,
                gappa_filename="gappa_poly.g",
                dichotomy=dichotomy_map)

            poly_eval_error = max(
                [sup(abs(err)) for err in poly_eval_error_dico])
        else:
            poly_eval_error = 0.0
            sub_poly_eval_error = 0.0
            Log.report(Log.Warning,
                       "gappa is not installed in this environnement")
            Log.report(Log.Info, "stopping autonomous degree research")
            # incrementing polynomial degree to counteract initial decrementation effect
            poly_degree += 1
            break
        Log.report(Log.Info, "poly evaluation error: %s" % poly_eval_error)
        Log.report(Log.Info,
                   "sub poly evaluation error: %s" % sub_poly_eval_error)

        global_poly_error = None
        global_rel_poly_error = None

        # keep the worst relative error over the three sub-intervals
        for case_index in range(3):
            poly_error = poly_approx_error + poly_eval_error_dico[
                case_index]
            rel_poly_error = sup(
                abs(poly_error /
                    sollya.exp(approx_interval_split[case_index])))
            # identity test: the other operand may be a sollya object whose
            # comparison operators are overloaded
            if global_rel_poly_error is None or rel_poly_error > global_rel_poly_error:
                global_rel_poly_error = rel_poly_error
                global_poly_error = poly_error
        flag = error_goal > global_rel_poly_error

        if flag:
            break
        else:
            poly_degree += 1

    late_overflow_test = Comparison(ik,
                                    self.precision.get_emax(),
                                    specifier=Comparison.Greater,
                                    likely=False,
                                    debug=debug_multi,
                                    tag="late_overflow_test")
    # integer division: the offset is used as an integer-format constant and
    # as an exponent, so it must not become a float (e.g. 115.5) when the
    # field size is odd
    overflow_exp_offset = (self.precision.get_emax() -
                           self.precision.get_field_size() // 2)
    diff_k = Subtraction(
        ik,
        Constant(overflow_exp_offset,
                 precision=self.precision.get_integer_format()),
        precision=self.precision.get_integer_format(),
        debug=debug_multi,
        tag="diff_k",
    )
    # 2^k is applied in two steps so that each scaling factor is representable
    late_overflow_result = (ExponentInsertion(
        diff_k, precision=self.precision) * poly) * ExponentInsertion(
            overflow_exp_offset, precision=self.precision)
    late_overflow_result.set_attributes(silent=False,
                                        tag="late_overflow_result",
                                        debug=debug_multi,
                                        precision=self.precision)
    late_overflow_return = ConditionBlock(
        Test(late_overflow_result, specifier=Test.IsInfty, likely=False),
        ExpRaiseReturn(ML_FPE_Overflow,
                       return_value=FP_PlusInfty(self.precision)),
        Return(late_overflow_result, precision=self.precision))

    late_underflow_test = Comparison(k,
                                     self.precision.get_emin_normal(),
                                     specifier=Comparison.LessOrEqual,
                                     likely=False)
    underflow_exp_offset = 2 * self.precision.get_field_size()
    corrected_exp = Addition(
        ik,
        Constant(underflow_exp_offset,
                 precision=self.precision.get_integer_format()),
        precision=self.precision.get_integer_format(),
        tag="corrected_exp")
    late_underflow_result = (
        ExponentInsertion(corrected_exp, precision=self.precision) *
        poly) * ExponentInsertion(-underflow_exp_offset,
                                  precision=self.precision)
    late_underflow_result.set_attributes(debug=debug_multi,
                                         tag="late_underflow_result",
                                         silent=False)
    test_subnormal = Test(late_underflow_result,
                          specifier=Test.IsSubnormal)
    late_underflow_return = Statement(
        ConditionBlock(
            test_subnormal,
            ExpRaiseReturn(ML_FPE_Underflow,
                           return_value=late_underflow_result)),
        Return(late_underflow_result, precision=self.precision))

    twok = ExponentInsertion(ik,
                             tag="exp_ik",
                             debug=debug_multi,
                             precision=self.precision)
    std_result = twok * poly
    std_result.set_attributes(tag="std_result", debug=debug_multi)
    result_scheme = ConditionBlock(
        late_overflow_test, late_overflow_return,
        ConditionBlock(late_underflow_test, late_underflow_return,
                       Return(std_result, precision=self.precision)))
    std_return = ConditionBlock(
        early_overflow_test, early_overflow_return,
        ConditionBlock(early_underflow_test, early_underflow_return,
                       result_scheme))

    # main scheme
    Log.report(Log.Info, "\033[33;1m MDL scheme \033[0m")
    scheme = ConditionBlock(
        test_nan_or_inf,
        Statement(ClearException() if self.libm_compliant else Statement(),
                  specific_return), std_return)

    return scheme
def piecewise_approximation(function,
                            variable,
                            precision,
                            bound_low=-1.0,
                            bound_high=1.0,
                            num_intervals=16,
                            max_degree=2,
                            error_threshold=sollya.S2**-24):
    """Generate a table-based piecewise polynomial approximation scheme.

    The range [bound_low, bound_high] is split into num_intervals
    sub-intervals of equal size. A polynomial of degree at most max_degree
    is computed on each of them and its coefficients are stored in one row
    of a coefficient table. The returned scheme computes the sub-interval
    index of variable, loads the matching coefficients and evaluates the
    polynomial with a Horner scheme.

    Args:
        function: callable applied both to sollya expressions and to
            numerical bounds, returning the function to approximate
        variable: operation node holding the scheme input
        precision: format used for the coefficients and the evaluation
        bound_low: lower bound of the approximation range
        bound_high: upper bound of the approximation range
        num_intervals: number of sub-intervals (rows of the table)
        max_degree: maximal polynomial degree (the table has
            max_degree + 1 columns)
        error_threshold: target error given to sollya.guessdegree

    Returns:
        a pair (poly_scheme, max_approx_error): the evaluation scheme and
        the largest approximation error reported over all sub-intervals.
    """
    # table to store coefficients of the approximation on each segment
    coeff_table = ML_NewTable(dimensions=[num_intervals, max_degree + 1],
                              storage_precision=precision,
                              tag="coeff_table")

    def error_function(p, f, ai, mod, t):
        return sollya.dirtyinfnorm(p - f, ai)

    max_approx_error = 0.0
    interval_size = (bound_high - bound_low) / num_intervals

    for i in range(num_intervals):
        subint_low = bound_low + i * interval_size
        subint_high = bound_low + (i + 1) * interval_size

        # the function is translated so that each polynomial is expressed
        # relatively to the lower bound of its sub-interval
        local_function = function(sollya.x + subint_low)
        local_interval = Interval(-interval_size, interval_size)

        local_degree = sollya.guessdegree(local_function, local_interval,
                                          error_threshold)
        degree = min(max_degree, local_degree)

        if function(subint_low) == 0.0:
            # if the lower bound is a zero to the function, we
            # need to force value=0 for the constant coefficient
            # and extend the approximation interval
            degree_list = range(1, degree + 1)
            poly_object, approx_error = Polynomial.build_from_approximation_with_error(
                function(sollya.x),
                degree_list, [precision] * len(degree_list),
                Interval(-subint_high, subint_high),
                sollya.absolute,
                error_function=error_function)
        else:
            try:
                poly_object, approx_error = Polynomial.build_from_approximation_with_error(
                    local_function,
                    degree, [precision] * (degree + 1),
                    local_interval,
                    sollya.absolute,
                    error_function=error_function)
            except SollyaError:
                print("degree: {}".format(degree))
                # bare raise keeps the original traceback
                raise

        # every column of the row is written (zero-padded above the local
        # degree) so that all rows can be evaluated with the same scheme
        for ci in range(max_degree + 1):
            if ci in poly_object.coeff_map:
                coeff_table[i][ci] = poly_object.coeff_map[ci]
            else:
                coeff_table[i][ci] = 0.0

        max_approx_error = max(max_approx_error, abs(approx_error))

    # computing offset
    diff = Subtraction(variable,
                       Constant(bound_low, precision=precision),
                       tag="diff",
                       precision=precision)
    # delta = bound_high - bound_low
    delta_ratio = Constant(num_intervals / (bound_high - bound_low),
                           precision=precision)
    # computing table index
    # index = nearestint(diff / delta * <num_intervals>), clamped to the
    # valid row range [0, num_intervals - 1]
    index = Max(0,
                Min(
                    NearestInteger(Multiplication(diff,
                                                  delta_ratio,
                                                  precision=precision),
                                   precision=ML_Int32), num_intervals - 1),
                tag="index",
                debug=True,
                precision=ML_Int32)
    poly_var = Subtraction(diff,
                           Multiplication(
                               Conversion(index, precision=precision),
                               Constant(interval_size,
                                        precision=precision)),
                           precision=precision,
                           tag="poly_var",
                           debug=True)
    # generating indexed polynomial: the number of coefficients must not
    # depend on the degree selected for the last sub-interval, so the full
    # table width is used
    coeffs = [(ci, TableLoad(coeff_table, index, ci))
              for ci in range(max_degree + 1)][::-1]
    poly_scheme = PolynomialSchemeEvaluator.generate_horner_scheme2(
        coeffs, poly_var, precision, {}, precision)
    return poly_scheme, max_approx_error