def generate_emulate(self, result, mpfr_x, mpfr_rnd): """ generate the emulation code for ML_Log2 functions mpfr_x is a mpfr_t variable which should have the right precision mpfr_rnd is the rounding mode """ emulate_func_name = "mpfr_cos" emulate_func_op = FunctionOperator(emulate_func_name, arg_map={ 0: FO_Result(0), 1: FO_Arg(0), 2: FO_Arg(1) }, require_header=["mpfr.h"]) emulate_func = FunctionObject(emulate_func_name, [ML_Mpfr_t, ML_Int32], ML_Mpfr_t, emulate_func_op) mpfr_call = Statement( ReferenceAssign(result, emulate_func(mpfr_x, mpfr_rnd))) return mpfr_call
def __init__(self, pattern, arity=1, exu=None, **kw): TemplateOperatorFormat.__init__( self, pattern, arg_map=({ index: arg_obj for (index, arg_obj) in [(0, FO_Result())] + [(i + 1, FO_Arg(i)) for i in range(arity)] }), *kw) MachineInstruction.__init__(self, exu)
def generate_emulate(self, result, mpfr_x, mpfr_rnd): """ generate the emulation code for ML_Log2 functions mpfr_x is a mpfr_t variable which should have the right precision mpfr_rnd is the rounding mode """ #mpfr_x = emulate_implementation.add_input_variable("x", ML_Mpfr_t) #mpfr_rnd = emulate_implementation.add_input_variable("rnd", ML_Int32) emulate_func_name = "mpfr_log10" emulate_func_op = FunctionOperator(emulate_func_name, arg_map={ 0: FO_Result(0), 1: FO_Arg(0), 2: FO_Arg(1) }, require_header=["mpfr.h"]) emulate_func = FunctionObject(emulate_func_name, [ML_Mpfr_t, ML_Int32], ML_Mpfr_t, emulate_func_op) #emulate_func_op.declare_prototype = emulate_func mpfr_call = Statement( ReferenceAssign(result, emulate_func(mpfr_x, mpfr_rnd))) return mpfr_call
def generate_scheme(self): #func_implementation = CodeFunction(self.function_name, output_format = self.precision) vx = self.implementation.add_input_variable("x", self.get_input_precision()) mpfr_x = Conversion(vx, precision=ML_Mpfr_t) emulate_func_name = "mpfr_exp" emulate_func_op = FunctionOperator(emulate_func_name, arg_map={ 0: FO_Result(0), 1: FO_Arg(0) }, require_header=["mpfr.h"]) emulate_func = FunctionObject(emulate_func_name, [ML_Mpfr_t], ML_Mpfr_t, emulate_func_op) emulate_func_op.declare_prototype = emulate_func mpfr_call = Conversion(emulate_func(mpfr_x), precision=self.precision) scheme = Statement(Return(mpfr_call)) return scheme
#switch_map[sub_half] = Return(-poly_scheme_vector[sub_half]) #switch_map[half + sub_half] = Return(poly_scheme_vector[sub_half]) switch_map[(sub_half, half + sub_half)] = Return(factor2 * poly_scheme_vector[sub_half]) result = SwitchBlock(modk, switch_map) ####################################################################### # LARGE ARGUMENT MANAGEMENT # # (lar: Large Argument Reduction) # ####################################################################### # payne and hanek argument reduction for large arguments #red_func_name = "payne_hanek_cosfp32" # "payne_hanek_fp32_asm" red_func_name = "payne_hanek_fp32_asm" payne_hanek_func_op = FunctionOperator(red_func_name, arg_map = {0: FO_Arg(0)}, require_header = ["support_lib/ml_red_arg.h"]) payne_hanek_func = FunctionObject(red_func_name, [ML_Binary32], ML_Binary64, payne_hanek_func_op) payne_hanek_func_op.declare_prototype = payne_hanek_func #large_arg_red = FunctionCall(payne_hanek_func, vx) large_arg_red = payne_hanek_func(vx) red_bound = S2**20 cond = Abs(vx) >= red_bound cond.set_attributes(tag = "cond", likely = False) lar_neark = NearestInteger(large_arg_red, precision = ML_Int64) lar_modk = Modulo(lar_neark, Constant(16, precision = ML_Int64), tag = "lar_modk", debug = True) # Modulo is supposed to be already performed (by payne_hanek_cosfp32) #lar_modk = NearestInteger(large_arg_red, precision = ML_Int64)
def generate_scheme(self): # declaring CodeFunction and retrieving input variable vx = Abs(self.implementation.add_input_variable("x", self.precision), tag="vx") Log.report(Log.Info, "generating implementation scheme") if self.debug_flag: Log.report(Log.Info, "debug has been enabled") # local overloading of RaiseReturn operation def ExpRaiseReturn(*args, **kwords): kwords["arg_value"] = vx kwords["function_name"] = self.function_name return RaiseReturn(*args, **kwords) debug_precision = { ML_Binary32: debug_ftox, ML_Binary64: debug_lftolx }[self.precision] test_nan_or_inf = Test(vx, specifier=Test.IsInfOrNaN, likely=False, debug=True, tag="nan_or_inf") test_nan = Test(vx, specifier=Test.IsNaN, debug=True, tag="is_nan_test") test_positive = Comparison(vx, 0, specifier=Comparison.GreaterOrEqual, debug=True, tag="inf_sign") test_signaling_nan = Test(vx, specifier=Test.IsSignalingNaN, debug=True, tag="is_signaling_nan") return_snan = Statement( ExpRaiseReturn(ML_FPE_Invalid, return_value=FP_QNaN(self.precision))) # return in case of infinity input infty_return = Statement( ConditionBlock(test_positive, Return(FP_PlusInfty(self.precision)), Return(FP_PlusZero(self.precision)))) # return in case of specific value input (NaN or inf) specific_return = ConditionBlock( test_nan, ConditionBlock(test_signaling_nan, return_snan, Return(FP_QNaN(self.precision))), infty_return) # return in case of standard (non-special) input sollya_precision = self.precision.get_sollya_object() hi_precision = self.precision.get_field_size() - 3 # argument reduction frac_pi_index = 3 frac_pi = round(S2**frac_pi_index / pi, sollya_precision, sollya.RN) inv_frac_pi = round(pi / S2**frac_pi_index, hi_precision, sollya.RN) inv_frac_pi_lo = round(pi / S2**frac_pi_index - inv_frac_pi, sollya_precision, sollya.RN) # computing k = E(x * frac_pi) vx_pi = Multiplication(vx, frac_pi, precision=self.precision) k = NearestInteger(vx_pi, precision=ML_Int32, tag="k", debug=True) fk = Conversion(k, precision=self.precision, tag="fk") inv_frac_pi_cst = Constant(inv_frac_pi, tag="inv_frac_pi", precision=self.precision) inv_frac_pi_lo_cst = Constant(inv_frac_pi_lo, tag="inv_frac_pi_lo", precision=self.precision) red_vx_hi = (vx - inv_frac_pi_cst * fk) red_vx_hi.set_attributes(tag="red_vx_hi", debug=debug_precision, precision=self.precision) red_vx_lo_sub = inv_frac_pi_lo_cst * fk red_vx_lo_sub.set_attributes(tag="red_vx_lo_sub", debug=debug_precision, unbreakable=True, precision=self.precision) vx_d = Conversion(vx, precision=ML_Binary64, tag="vx_d") pre_red_vx = red_vx_hi - inv_frac_pi_lo_cst * fk pre_red_vx_d_hi = (vx_d - inv_frac_pi_cst * fk) pre_red_vx_d_hi.set_attributes(tag="pre_red_vx_d_hi", precision=ML_Binary64, debug=debug_lftolx) pre_red_vx_d = pre_red_vx_d_hi - inv_frac_pi_lo_cst * fk pre_red_vx_d.set_attributes(tag="pre_red_vx_d", debug=debug_lftolx, precision=ML_Binary64) modk = Modulo(k, 2**(frac_pi_index + 1), precision=ML_Int32, tag="switch_value", debug=True) sel_c = Equal(BitLogicAnd(modk, 2**(frac_pi_index - 1)), 2**(frac_pi_index - 1)) red_vx = Select(sel_c, -pre_red_vx, pre_red_vx) red_vx.set_attributes(tag="red_vx", debug=debug_precision, precision=self.precision) red_vx_d = Select(sel_c, -pre_red_vx_d, pre_red_vx_d) red_vx_d.set_attributes(tag="red_vx_d", debug=debug_lftolx, precision=ML_Binary64) approx_interval = Interval(-pi / (S2**(frac_pi_index + 1)), pi / S2**(frac_pi_index + 1)) Log.report(Log.Info, "approx interval: %s\n" % approx_interval) error_goal_approx = S2**-self.precision.get_precision() Log.report(Log.Info, "building mathematical polynomial") poly_degree_vector = [None] * 2**(frac_pi_index + 1) error_function = lambda p, f, ai, mod, t: dirtyinfnorm(f - p, ai) #polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_estrin_scheme polynomial_scheme_builder = PolynomialSchemeEvaluator.generate_horner_scheme index_relative = [] poly_object_vector = [None] * 2**(frac_pi_index + 1) for i in range(2**(frac_pi_index + 1)): sub_func = cos(sollya.x + i * pi / S2**frac_pi_index) degree = int( sup(guessdegree(sub_func, approx_interval, error_goal_approx))) + 1 degree_list = range(degree + 1) a_interval = approx_interval if i == 0: # ad-hoc, TODO: to be cleaned degree = 6 degree_list = range(0, degree + 1, 2) elif i % 2**(frac_pi_index) == 2**(frac_pi_index - 1): # for pi/2 and 3pi/2, an approx to sin=cos(pi/2+x) # must be generated degree_list = range(1, degree + 1, 2) if i == 3 or i == 5 or i == 7 or i == 9: precision_list = [sollya.binary64 ] + [sollya.binary32] * (degree) else: precision_list = [sollya.binary32] * (degree + 1) poly_degree_vector[i] = degree constraint = sollya.absolute delta = (2**(frac_pi_index - 3)) centered_i = (i % 2**(frac_pi_index)) - 2**(frac_pi_index - 1) if centered_i < delta and centered_i > -delta and centered_i != 0: constraint = sollya.relative index_relative.append(i) Log.report( Log.Info, "generating approximation for %d/%d" % (i, 2**(frac_pi_index + 1))) poly_object_vector[ i], _ = Polynomial.build_from_approximation_with_error( sub_func, degree_list, precision_list, a_interval, constraint, error_function=error_function) # unified power map for red_sx^n upm = {} rel_error_list = [] poly_scheme_vector = [None] * (2**(frac_pi_index + 1)) for i in range(2**(frac_pi_index + 1)): poly_object = poly_object_vector[i] poly_precision = self.precision if i == 3 or i == 5 or i == 7 or i == 9: poly_precision = ML_Binary64 c0 = Constant(coeff(poly_object.get_sollya_object(), 0), precision=ML_Binary64) c1 = Constant(coeff(poly_object.get_sollya_object(), 1), precision=self.precision) poly_hi = (c0 + c1 * red_vx) poly_hi.set_precision(ML_Binary64) red_vx_d_2 = red_vx_d * red_vx_d poly_scheme = poly_hi + red_vx_d_2 * polynomial_scheme_builder( poly_object.sub_poly(start_index=2, offset=2), red_vx, unified_precision=self.precision, power_map_=upm) poly_scheme.set_attributes(unbreakable=True) elif i == 4: c1 = Constant(coeff(poly_object.get_sollya_object(), 1), precision=ML_Binary64) poly_scheme = c1 * red_vx_d + polynomial_scheme_builder( poly_object.sub_poly(start_index=2), red_vx, unified_precision=self.precision, power_map_=upm) poly_scheme.set_precision(ML_Binary64) else: poly_scheme = polynomial_scheme_builder( poly_object, red_vx, unified_precision=poly_precision, power_map_=upm) #if i == 3: # c0 = Constant(coeff(poly_object.get_sollya_object(), 0), precision = self.precision) # c1 = Constant(coeff(poly_object.get_sollya_object(), 1), precision = self.precision) # poly_scheme = (c0 + c1 * red_vx) + polynomial_scheme_builder(poly_object.sub_poly(start_index = 2), red_vx, unified_precision = self.precision, power_map_ = upm) poly_scheme.set_attributes(tag="poly_cos%dpi%d" % (i, 2**(frac_pi_index)), debug=debug_precision) poly_scheme_vector[i] = poly_scheme #try: if is_gappa_installed() and i == 3: opt_scheme = self.opt_engine.optimization_process( poly_scheme, self.precision, copy=True, fuse_fma=self.fuse_fma) tag_map = {} self.opt_engine.register_nodes_by_tag(opt_scheme, tag_map) gappa_vx = Variable("red_vx", precision=self.precision, interval=approx_interval) cg_eval_error_copy_map = { tag_map["red_vx"]: gappa_vx, tag_map["red_vx_d"]: gappa_vx, } print "opt_scheme" print opt_scheme.get_str(depth=None, display_precision=True, memoization_map={}) eval_error = self.gappa_engine.get_eval_error_v2( self.opt_engine, opt_scheme, cg_eval_error_copy_map, gappa_filename="red_arg_%d.g" % i) poly_range = cos(approx_interval + i * pi / S2**frac_pi_index) rel_error_list.append(eval_error / poly_range) #for rel_error in rel_error_list: # print sup(abs(rel_error)) #return # case 17 #poly17 = poly_object_vector[17] #c0 = Constant(coeff(poly17.get_sollya_object(), 0), precision = self.precision) #c1 = Constant(coeff(poly17.get_sollya_object(), 1), precision = self.precision) #poly_scheme_vector[17] = FusedMultiplyAdd(c1, red_vx, c0, specifier = FusedMultiplyAdd.Standard) + polynomial_scheme_builder(poly17.sub_poly(start_index = 2), red_vx, unified_precision = self.precision, power_map_ = upm) half = 2**frac_pi_index sub_half = 2**(frac_pi_index - 1) # determine if the reduced input is within the second and third quarter (not first nor fourth) # to negate the cosine output factor_cond = BitLogicAnd(BitLogicXor( BitLogicRightShift(modk, frac_pi_index), BitLogicRightShift(modk, frac_pi_index - 1)), 1, tag="factor_cond", debug=True) CM1 = Constant(-1, precision=self.precision) C1 = Constant(1, precision=self.precision) factor = Select(factor_cond, CM1, C1, tag="factor", debug=debug_precision) factor2 = Select(Equal(modk, Constant(sub_half)), CM1, C1, tag="factor2", debug=debug_precision) switch_map = {} if 0: for i in range(2**(frac_pi_index + 1)): switch_map[i] = Return(poly_scheme_vector[i]) else: for i in range(2**(frac_pi_index - 1)): switch_case = (i, half - i) #switch_map[i] = Return(poly_scheme_vector[i]) #switch_map[half-i] = Return(-poly_scheme_vector[i]) if i != 0: switch_case = switch_case + (half + i, 2 * half - i) #switch_map[half+i] = Return(-poly_scheme_vector[i]) #switch_map[2*half-i] = Return(poly_scheme_vector[i]) if poly_scheme_vector[i].get_precision() != self.precision: poly_result = Conversion(poly_scheme_vector[i], precision=self.precision) else: poly_result = poly_scheme_vector[i] switch_map[switch_case] = Return(factor * poly_result) #switch_map[sub_half] = Return(-poly_scheme_vector[sub_half]) #switch_map[half + sub_half] = Return(poly_scheme_vector[sub_half]) switch_map[(sub_half, half + sub_half)] = Return( factor2 * poly_scheme_vector[sub_half]) result = SwitchBlock(modk, switch_map) ####################################################################### # LARGE ARGUMENT MANAGEMENT # # (lar: Large Argument Reduction) # ####################################################################### # payne and hanek argument reduction for large arguments #red_func_name = "payne_hanek_cosfp32" # "payne_hanek_fp32_asm" red_func_name = "payne_hanek_fp32_asm" payne_hanek_func_op = FunctionOperator( red_func_name, arg_map={0: FO_Arg(0)}, require_header=["support_lib/ml_red_arg.h"]) payne_hanek_func = FunctionObject(red_func_name, [ML_Binary32], ML_Binary64, payne_hanek_func_op) payne_hanek_func_op.declare_prototype = payne_hanek_func #large_arg_red = FunctionCall(payne_hanek_func, vx) large_arg_red = payne_hanek_func(vx) red_bound = S2**20 cond = Abs(vx) >= red_bound cond.set_attributes(tag="cond", likely=False) lar_neark = NearestInteger(large_arg_red, precision=ML_Int64) lar_modk = Modulo(lar_neark, Constant(16, precision=ML_Int64), tag="lar_modk", debug=True) # Modulo is supposed to be already performed (by payne_hanek_cosfp32) #lar_modk = NearestInteger(large_arg_red, precision = ML_Int64) pre_lar_red_vx = large_arg_red - Conversion(lar_neark, precision=ML_Binary64) pre_lar_red_vx.set_attributes(precision=ML_Binary64, debug=debug_lftolx, tag="pre_lar_red_vx") lar_red_vx = Conversion(pre_lar_red_vx, precision=self.precision, debug=debug_precision, tag="lar_red_vx") lar_red_vx_lo = Conversion( pre_lar_red_vx - Conversion(lar_red_vx, precision=ML_Binary64), precision=self.precision) lar_red_vx_lo.set_attributes(tag="lar_red_vx_lo", precision=self.precision) lar_k = 3 # large arg reduction Universal Power Map lar_upm = {} lar_switch_map = {} approx_interval = Interval(-0.5, 0.5) for i in range(2**(lar_k + 1)): frac_pi = pi / S2**lar_k func = cos(frac_pi * i + frac_pi * sollya.x) degree = 6 error_mode = sollya.absolute if i % 2**(lar_k) == 2**(lar_k - 1): # close to sin(x) cases func = -sin(frac_pi * x) if i == 2**(lar_k - 1) else sin(frac_pi * x) degree_list = range(0, degree + 1, 2) precision_list = [sollya.binary32] * len(degree_list) poly_object, _ = Polynomial.build_from_approximation_with_error( func / x, degree_list, precision_list, approx_interval, error_mode) poly_object = poly_object.sub_poly(offset=-1) else: degree_list = range(degree + 1) precision_list = [sollya.binary32] * len(degree_list) poly_object, _ = Polynomial.build_from_approximation_with_error( func, degree_list, precision_list, approx_interval, error_mode) if i == 3 or i == 5 or i == 7 or i == 9 or i == 11 or i == 13: poly_precision = ML_Binary64 c0 = Constant(coeff(poly_object.get_sollya_object(), 0), precision=ML_Binary64) c1 = Constant(coeff(poly_object.get_sollya_object(), 1), precision=self.precision) poly_hi = (c0 + c1 * lar_red_vx) poly_hi.set_precision(ML_Binary64) pre_poly_scheme = poly_hi + polynomial_scheme_builder( poly_object.sub_poly(start_index=2), lar_red_vx, unified_precision=self.precision, power_map_=lar_upm) pre_poly_scheme.set_attributes(precision=ML_Binary64) poly_scheme = Conversion(pre_poly_scheme, precision=self.precision) elif i == 4 or i == 12: c1 = Constant(coeff(poly_object.get_sollya_object(), 1), precision=self.precision) c3 = Constant(coeff(poly_object.get_sollya_object(), 3), precision=self.precision) c5 = Constant(coeff(poly_object.get_sollya_object(), 5), precision=self.precision) poly_hi = polynomial_scheme_builder( poly_object.sub_poly(start_index=3), lar_red_vx, unified_precision=self.precision, power_map_=lar_upm) poly_hi.set_attributes(tag="poly_lar_%d_hi" % i, precision=ML_Binary64) poly_scheme = Conversion(FusedMultiplyAdd( c1, lar_red_vx, poly_hi, precision=ML_Binary64) + c1 * lar_red_vx_lo, precision=self.precision) else: poly_scheme = polynomial_scheme_builder( poly_object, lar_red_vx, unified_precision=self.precision, power_map_=lar_upm) # poly_scheme = polynomial_scheme_builder(poly_object, lar_red_vx, unified_precision = self.precision, power_map_ = lar_upm) poly_scheme.set_attributes(tag="lar_poly_%d" % i, debug=debug_precision) lar_switch_map[(i, )] = Return(poly_scheme) lar_result = SwitchBlock(lar_modk, lar_switch_map) # main scheme #Log.report(Log.Info, "\033[33;1m MDL scheme \033[0m") # scheme = Statement(ConditionBlock(cond, lar_result, result)) Log.report(Log.Info, "Construction of the initial MDL scheme") scheme = Statement(pre_red_vx_d, red_vx_lo_sub, ConditionBlock(cond, lar_result, result)) return scheme
def generate_bench(self, processor, test_num=1000, unroll_factor=10): """ generate performance bench for self.op_class """ initial_inputs = [ Constant(random.uniform(inf(self.init_interval), sup(self.init_interval)), precision=precision) for i, precision in enumerate(self.input_precisions) ] var_inputs = [ Variable("var_%d" % i, precision=FormatAttributeWrapper(precision, ["volatile"]), var_type=Variable.Local) for i, precision in enumerate(self.input_precisions) ] printf_timing_op = FunctionOperator( "printf", arg_map={ 0: "\"%s[%s] %%lld elts computed "\ "in %%lld cycles =>\\n %%.3f CPE \\n\"" % ( self.bench_name, self.output_precision.get_display_format() ), 1: FO_Arg(0), 2: FO_Arg(1), 3: FO_Arg(2), 4: FO_Arg(3) }, void_function=True ) printf_timing_function = FunctionObject( "printf", [self.output_precision, ML_Int64, ML_Int64, ML_Binary64], ML_Void, printf_timing_op) timer = Variable("timer", precision=ML_Int64, var_type=Variable.Local) void_function_op = FunctionOperator("(void)", arity=1, void_function=True) void_function = FunctionObject("(void)", [self.output_precision], ML_Void, void_function_op) # initialization of operation inputs init_assign = metaop.Statement() for var_input, init_value in zip(var_inputs, initial_inputs): init_assign.push(ReferenceAssign(var_input, init_value)) # test loop loop_i = Variable("i", precision=ML_Int64, var_type=Variable.Local) test_num_cst = Constant(test_num / unroll_factor, precision=ML_Int64, tag="test_num") # Goal build a chain of dependant operation to measure # elementary operation latency local_inputs = tuple(var_inputs) local_result = self.op_class(*local_inputs, precision=self.output_precision, unbreakable=True) for i in range(unroll_factor - 1): local_inputs = tuple([local_result] + var_inputs[1:]) local_result = self.op_class(*local_inputs, precision=self.output_precision, unbreakable=True) # renormalisation local_result = self.renorm_function(local_result) # variable assignation to build dependency chain var_assign = Statement() var_assign.push(ReferenceAssign(var_inputs[0], local_result)) final_value = var_inputs[0] # loop increment value loop_increment = 1 test_loop = Loop( ReferenceAssign(loop_i, Constant(0, precision=ML_Int32)), loop_i < test_num_cst, Statement(var_assign, ReferenceAssign(loop_i, loop_i + loop_increment)), ) # bench scheme test_scheme = Statement( ReferenceAssign(timer, processor.get_current_timestamp()), init_assign, test_loop, ReferenceAssign( timer, Subtraction(processor.get_current_timestamp(), timer, precision=ML_Int64)), # prevent intermediary variable simplification void_function(final_value), printf_timing_function( final_value, Constant(test_num, precision=ML_Int64), timer, Division(Conversion(timer, precision=ML_Binary64), Constant(test_num, precision=ML_Binary64), precision=ML_Binary64)) # ,Return(Constant(0, precision = ML_Int32)) ) return test_scheme
def generate_bench_wrapper(self, test_num=1, loop_num=100000, test_ranges=[Interval(-1.0, 1.0)], debug=False): # interval where the array lenght is chosen from (randomly) index_range = self.test_index_range auto_test = CodeFunction("bench_wrapper", output_format=ML_Binary64) tested_function = self.implementation.get_function_object() function_name = self.implementation.get_name() failure_report_op = FunctionOperator("report_failure") failure_report_function = FunctionObject("report_failure", [], ML_Void, failure_report_op) printf_success_op = FunctionOperator( "printf", arg_map={0: "\"test successful %s\\n\"" % function_name}, void_function=True) printf_success_function = FunctionObject("printf", [], ML_Void, printf_success_op) output_precision = FormatAttributeWrapper(self.precision, ["volatile"]) test_total = test_num # number of arrays expected as inputs for tested_function NUM_INPUT_ARRAY = 1 # position of the input array in tested_function operands (generally # equals to 1 as to 0-th input is often the destination array) INPUT_INDEX_OFFSET = 1 # concatenating standard test array at the beginning of randomly # generated array TABLE_SIZE_VALUES = [ len(std_table) for std_table in self.standard_test_cases ] + [ random.randrange(index_range[0], index_range[1] + 1) for i in range(test_num) ] OFFSET_VALUES = [sum(TABLE_SIZE_VALUES[:i]) for i in range(test_total)] table_size_offset_array = generate_2d_table( test_total, 2, ML_UInt32, self.uniquify_name("table_size_array"), value_gen=(lambda row_id: (TABLE_SIZE_VALUES[row_id], OFFSET_VALUES[row_id]))) INPUT_ARRAY_SIZE = sum(TABLE_SIZE_VALUES) # TODO/FIXME: implement proper input range depending on input index # assuming a single input array input_precisions = [self.get_input_precision(1).get_data_precision()] rng_map = [ get_precision_rng(precision, inf(test_range), sup(test_range)) for precision, test_range in zip(input_precisions, test_ranges) ] # generated table of inputs input_tables = [ generate_1d_table( INPUT_ARRAY_SIZE, self.get_input_precision(INPUT_INDEX_OFFSET + table_id).get_data_precision(), self.uniquify_name("input_table_arg%d" % table_id), value_gen=( lambda _: input_precisions[table_id].round_sollya_object( rng_map[table_id].get_new_value(), sollya.RN))) for table_id in range(NUM_INPUT_ARRAY) ] # generate output_array output_array = generate_1d_table( INPUT_ARRAY_SIZE, output_precision, self.uniquify_name("output_array"), #value_gen=(lambda _: FP_QNaN(self.precision)) value_gen=(lambda _: None), const=False, empty=True) # accumulate element number acc_num = Variable("acc_num", precision=ML_Int64, var_type=Variable.Local) def empty_post_statement_gen(input_tables, output_array, table_size_offset_array, array_offset, array_len, test_id): return Statement() test_loop = self.get_array_test_wrapper(test_total, tested_function, table_size_offset_array, input_tables, output_array, acc_num, empty_post_statement_gen) timer = Variable("timer", precision=ML_Int64, var_type=Variable.Local) printf_timing_op = FunctionOperator( "printf", arg_map={ 0: "\"%s %%\"PRIi64\" elts computed in %%\"PRIi64\" nanoseconds => %%.3f CPE \\n\"" % function_name, 1: FO_Arg(0), 2: FO_Arg(1), 3: FO_Arg(2) }, void_function=True) printf_timing_function = FunctionObject( "printf", [ML_Int64, ML_Int64, ML_Binary64], ML_Void, printf_timing_op) vj = Variable("j", precision=ML_Int32, var_type=Variable.Local) loop_num_cst = Constant(loop_num, precision=ML_Int32, tag="loop_num") loop_increment = 1 # bench measure of clock per element cpe_measure = Division( Conversion(timer, precision=ML_Binary64), Conversion(acc_num, precision=ML_Binary64), precision=ML_Binary64, tag="cpe_measure", ) # common test scheme between scalar and vector functions test_scheme = Statement( self.processor.get_init_timestamp(), ReferenceAssign(timer, self.processor.get_current_timestamp()), ReferenceAssign(acc_num, 0), Loop( ReferenceAssign(vj, Constant(0, precision=ML_Int32)), vj < loop_num_cst, Statement(test_loop, ReferenceAssign(vj, vj + loop_increment))), ReferenceAssign( timer, Subtraction(self.processor.get_current_timestamp(), timer, precision=ML_Int64)), printf_timing_function( Conversion(acc_num, precision=ML_Int64), timer, cpe_measure, ), Return(cpe_measure), # Return(Constant(0, precision = ML_Int32)) ) auto_test.set_scheme(test_scheme) return FunctionGroup([auto_test])
def generate_array_check_loop(self, input_tables, output_array, table_size_offset_array, array_offset, array_len, test_id): # internal array iterator index vj = Variable("j", precision=ML_UInt32, var_type=Variable.Local) printf_input_function = self.get_printf_input_function() printf_error_template = "printf(\"max %s error is %s \\n\", %s)" % ( self.function_name, self.precision.get_display_format().format_string, self.precision.get_display_format().pre_process_fct("{0}")) printf_error_op = TemplateOperatorFormat(printf_error_template, arity=1, void_function=True, require_header=["stdio.h"]) printf_error_function = FunctionObject("printf", [self.precision], ML_Void, printf_error_op) printf_max_op = FunctionOperator( "printf", arg_map={ 0: "\"max %s error is reached at input number %s \\n \"" % (self.function_name, "%d"), 1: FO_Arg(0) }, void_function=True, require_header=["stdio.h"]) printf_max_function = FunctionObject("printf", [self.precision], ML_Void, printf_max_op) NUM_INPUT_ARRAY = len(input_tables) # generate the expected table for the whole multi-array expected_table = self.generate_expected_table(input_tables, table_size_offset_array) # inputs for the (vj)-th entry of the sub-arrat local_inputs = tuple( TableLoad(input_tables[in_id], array_offset + vj) for in_id in range(NUM_INPUT_ARRAY)) # expected values for the (vj)-th entry of the sub-arrat expected_values = [ TableLoad(expected_table, array_offset + vj, i) for i in range(self.accuracy.get_num_output_value()) ] # local result for the (vj)-th entry of the sub-arrat local_result = TableLoad(output_array, array_offset + vj) if self.break_error: return_statement_break = Statement( printf_input_function(*((vj, ) + local_inputs + (local_result, ))), self.accuracy.get_output_print_call(self.function_name, output_values)) else: return_statement_break = Statement( printf_input_function(*((vj, ) + local_inputs + (local_result, ))), self.accuracy.get_output_print_call(self.function_name, expected_values), Return(Constant(1, precision=ML_Int32))) # loop implementation to check sub-array array_offset # results validity check_array_loop = Loop( ReferenceAssign(vj, 0), vj < array_len, Statement( ConditionBlock( self.accuracy.get_output_check_test( local_result, expected_values), return_statement_break), ReferenceAssign(vj, vj + 1), )) return check_array_loop
type_strict_match(v4float32, v4float32, v4float32): StandardAsmOperator("faddwq {} = {}, {}", arity=2, exu=DA_FPU), }, }, }, Subtraction: { None: { lambda _: True: { type_strict_match_list([ML_Int32, ML_UInt32], [ ML_UInt32, ML_Int32 ], [ML_Int32, ML_UInt32]): # subtract from: op1 - op0 (reverse) AdvancedAsmOperator("sbfw {} = {}, {}", arg_map={ 0: FO_Result(), 1: FO_Arg(1), 2: FO_Arg(0) }, arity=2, exu=DA_ALU), type_strict_match(ML_Binary32, ML_Binary32, ML_Binary32): # subtract from: op1 - op0 (reverse) AdvancedAsmOperator("fsbfw {} = {}, {}", arg_map={ 0: FO_Result(), 1: FO_Arg(1), 2: FO_Arg(0) }, arity=2, exu=DA_FPU), type_strict_match(ML_Binary64, ML_Binary64, ML_Binary64):
def generate_datafile_testbench(self, tc_list, io_map, input_signals, output_signals, time_step, test_fname="test.input"): """ Generate testbench with input and output data externalized in a data file """ # textio function to read hexadecimal text def FCT_HexaRead_gen(input_format): legalized_input_format = input_format FCT_HexaRead = FunctionObject("hread", [HDL_LINE, legalized_input_format], ML_Void, FunctionOperator("hread", void_function=True, arity=2)) return FCT_HexaRead # textio function to read binary text FCT_Read = FunctionObject("read", [HDL_LINE, ML_StdLogic], ML_Void, FunctionOperator("read", void_function=True, arity=2)) input_line = Variable("input_line", precision=HDL_LINE, var_type=Variable.Local) # building ordered list of input and output signal names input_signal_list = [sname for sname in input_signals.keys()] input_statement = Statement() for input_name in input_signal_list: input_format = input_signals[input_name].precision input_var = Variable( "v_" + input_name, precision=input_format, var_type=Variable.Local) if input_format is ML_StdLogic: input_statement.add(FCT_Read(input_line, input_var)) else: input_statement.add(FCT_HexaRead_gen(input_format)(input_line, input_var)) input_statement.add(ReferenceAssign(input_signals[input_name], input_var)) output_signal_list = [sname for sname in output_signals.keys()] output_statement = Statement() for output_name in output_signal_list: output_format = output_signals[output_name].precision output_var = Variable( "v_" + output_name, precision=output_format, var_type=Variable.Local) if output_format is ML_StdLogic: output_statement.add(FCT_Read(input_line, output_var)) else: output_statement.add(FCT_HexaRead_gen(output_format)(input_line, output_var)) output_signal = output_signals[output_name] #value_msg = get_output_value_msg(output_signal, output_value) test_pass_cond, check_statement = get_output_check_statement(output_signal, output_name, output_var) input_msg = multi_Concatenation(*tuple(sum([[" %s=" % input_tag, signal_str_conversion(input_signals[input_tag], input_signals[input_tag].precision)] for input_tag in input_signal_list], []))) output_statement.add(check_statement) assert_statement = Assert( test_pass_cond, multi_Concatenation( "unexpected value for inputs ", input_msg, " expecting :", signal_str_conversion(output_var, output_format), " got :", signal_str_conversion(output_signal, output_format), precision = ML_String ), severity=Assert.Failure ) output_statement.add(assert_statement) self_component = self.implementation.get_component_object() self_instance = self_component(io_map = io_map, tag = "tested_entity") test_statement = Statement() DATA_FILE_NAME = test_fname with open(DATA_FILE_NAME, "w") as data_file: # dumping column tags data_file.write("# " + " ".join(input_signal_list + output_signal_list) + "\n") def get_raw_cst_string(cst_format, cst_value): size = int((cst_format.get_bit_size() + 3) / 4) return ("{:x}").format(cst_format.get_base_format().get_integer_coding(cst_value)).zfill(size) for input_values, output_values in tc_list: # TODO; generate test data file cst_list = [] for input_name in input_signal_list: input_value = input_values[input_name] input_format = input_signals[input_name].get_precision() cst_list.append(get_raw_cst_string(input_format, input_value)) for output_name in output_signal_list: output_value = output_values[output_name] output_format = output_signals[output_name].get_precision() cst_list.append(get_raw_cst_string(output_format, output_value)) # dumping line into file data_file.write(" ".join(cst_list) + "\n") input_stream = Variable("data_file", precision=HDL_FILE, var_type=Variable.Local) file_status = Variable("file_status", precision=HDL_OPEN_FILE_STATUS, var_type=Variable.Local) FCT_EndFile = FunctionObject("endfile", [HDL_FILE], ML_Bool, FunctionOperator("endfile", arity=1)) FCT_OpenFile = FunctionObject( "FILE_OPEN", [HDL_OPEN_FILE_STATUS, HDL_FILE, ML_String], ML_Void, FunctionOperator( "FILE_OPEN", arg_map={0: FO_Arg(0), 1: FO_Arg(1), 2: FO_Arg(2), 3: "READ_MODE"}, void_function=True)) FCT_ReadLine = FunctionObject( "readline", [HDL_FILE, HDL_LINE], ML_Void, FunctionOperator("readline", void_function=True, arity=2)) reset_statement = self.get_reset_statement(io_map, time_step) OPEN_OK = Constant("OPEN_OK", precision=HDL_OPEN_FILE_STATUS) testbench = CodeEntity("testbench") test_process = Process( reset_statement, FCT_OpenFile(file_status, input_stream, DATA_FILE_NAME), ConditionBlock( Comparison(file_status, OPEN_OK, specifier=Comparison.NotEqual), Assert( Constant(0, precision=ML_Bool), " \"failed to open file {}\"".format(DATA_FILE_NAME), severity=Assert.Failure ) ), # consume legend line FCT_ReadLine(input_stream, input_line), WhileLoop( LogicalNot(FCT_EndFile(input_stream)), Statement( FCT_ReadLine(input_stream, input_line), input_statement, Wait(time_step * (self.stage_num + 2)), output_statement, ), ), # end of test Assert( Constant(0, precision = ML_Bool), " \"end of test, no error encountered \"", severity = Assert.Warning ), # infinite end loop WhileLoop( Constant(1, precision=ML_Bool), Statement( Wait(time_step * (self.stage_num + 2)), ) ) ) testbench_scheme = Statement( self_instance, test_process ) if self.pipelined: half_time_step = time_step / 2 assert (half_time_step * 2) == time_step # adding clock process for pipelined bench clk_process = Process( Statement( ReferenceAssign( io_map["clk"], Constant(1, precision = ML_StdLogic) ), Wait(half_time_step), ReferenceAssign( io_map["clk"], Constant(0, precision = ML_StdLogic) ), Wait(half_time_step), ) ) testbench_scheme.push(clk_process) testbench.add_process(testbench_scheme) return [testbench]