def cint_cint_division(a, b, k, f):
    """Divide clear value ``a`` by clear value ``b`` with Goldschmidt iteration.

    The starting reciprocal estimate is obtained from
    ``approximate_reciprocal``; background on choosing that estimate:
    http://stackoverflow.com/questions/2661541/picking-good-first-estimates-for-goldschmidt-division

    :param a: dividend, handled as a ``cint``.
    :param b: divisor, handled as a ``cint``.
    :param k: bit length forwarded to ``approximate_reciprocal`` and used to
        derive the number of iterations.
    :param f: shift applied after every product (``>> f``) and used to scale
        the constant two; presumably the number of fractional bits.
    :returns: the last numerator iterate multiplied by the combined sign of
        ``a`` and ``b``.
    """
    # The iteration count could probably be lowered; it is deliberately kept
    # equal to the theta used by the earlier Goldschmidt implementation.
    iterations = int(ceil(log(k/3.5) / log(2)))
    scaled_two = cint(2) * two_power(f)

    # Work on magnitudes and re-apply the sign at the very end.
    b_sign = cint(1) - 2 * cint(b < 0)
    a_sign = cint(1) - 2 * cint(a < 0)
    b_magnitude = b * b_sign
    a_magnitude = a * a_sign

    initial_factor = approximate_reciprocal(b_magnitude, k, f, iterations)

    numer = Array(iterations, cint)
    denom = Array(iterations, cint)
    factor = Array(iterations, cint)

    numer[0] = a_magnitude
    denom[0] = b_magnitude
    factor[0] = initial_factor

    @for_range(1, iterations)
    def block(i):
        # Scale numerator and denominator by the previous factor, then
        # derive the next factor from the new denominator.
        numer[i] = (numer[i - 1] * factor[i - 1]) >> f
        denom[i] = (denom[i - 1] * factor[i - 1]) >> f
        factor[i] = scaled_two - denom[i]

    return (a_sign * b_sign) * numer[iterations - 1]
def sint_cint_division(a, b, k, f, kappa):
    """Divide secret value ``a`` by clear value ``b`` with Goldschmidt iteration.

    ``type(a) = sint`` and ``type(b) = cint``.

    :param a: secret dividend.
    :param b: clear divisor.
    :param k: bit length forwarded to ``approximate_reciprocal``; ``2*k`` is
        passed to ``TruncPr`` as the operand length.
    :param f: shift/truncation amount applied after every product;
        presumably the number of fractional bits.
    :param kappa: forwarded unchanged to ``TruncPr``.
    :returns: the last secret numerator iterate multiplied by the combined
        sign of ``a`` and ``b``.
    """
    iterations = int(ceil(log(k/3.5) / log(2)))
    scaled_two = cint(2) * two_power(f)

    # Work on magnitudes and re-apply the sign at the very end.
    b_sign = cint(1) - 2 * cint(b < 0)
    a_sign = sint(1) - 2 * sint(a < 0)
    b_magnitude = b * b_sign
    a_magnitude = a * a_sign

    initial_factor = approximate_reciprocal(b_magnitude, k, f, iterations)

    numer = Array(iterations, sint)
    denom = Array(iterations, cint)
    factor = Array(iterations, cint)

    numer[0] = a_magnitude
    denom[0] = b_magnitude
    factor[0] = initial_factor

    @for_range(1, iterations)
    def block(i):
        # The secret numerator is rescaled with TruncPr; the clear
        # denominator uses a plain right shift.
        numer[i] = TruncPr(numer[i - 1] * factor[i - 1], 2*k, f, kappa)
        next_denom = (denom[i - 1] * factor[i - 1]) >> f
        # Keep the new denominator in a temporary: the loop body must not
        # read and write the same variable.
        factor[i] = scaled_two - next_denom
        denom[i] = next_denom

    return (a_sign * b_sign) * numer[iterations - 1]
def decorator(loop_body):
    """Run ``loop_body`` over ``n_loops`` iterations split across threads.

    Relies on ``n_loops``, ``n_threads``, ``n_parallel``, ``initializer``,
    ``reducer`` and ``thread_mem_req`` from the enclosing scope.

    The iterations are split so that the first ``remainder`` argument slots
    run ``thread_rounds + 1`` iterations and the remaining slots run
    ``thread_rounds``.  Each row of ``args`` belongs to one thread:

    * column 0: first loop index handled by the thread,
    * column 1: address of the thread's reduction-state array,
    * columns 2 and up: per-thread ``regint`` memory, if requested.

    :param loop_body: called as ``loop_body(index)`` or, when thread memory
        was requested, ``loop_body(index, thread_mem)``.
    :returns: a zero-argument function returning the reduced state
        (passed through ``untuplify``).
    :raises CompilerError: if ``thread_mem_req`` asks for anything other
        than ``regint`` memory.
    """
    # Floor division keeps the per-thread round count integral regardless
    # of whether true division is in effect; it feeds index arithmetic and
    # loop counts below.
    thread_rounds = n_loops // n_threads
    remainder = n_loops % n_threads
    for t in thread_mem_req:
        if t != regint:
            raise CompilerError('Not implemented for other than regint')
    args = Matrix(n_threads, 2 + thread_mem_req.get(regint, 0), 'ci')
    state = tuple(initializer())

    def f(inc):
        # Tape body; ``inc`` is 1 for threads that take one extra iteration.
        # get_arg() presumably yields the argument given to run_tape(),
        # i.e. this thread's row in ``args``.
        if thread_mem_req:
            thread_mem = Array(thread_mem_req[regint], regint,
                               args[get_arg()].address + 2)
        mem_state = Array(len(state),
                          type(state[0]) if state else cint,
                          args[get_arg()][1])
        base = args[get_arg()][0]

        @map_reduce_single(n_parallel, thread_rounds + inc,
                           initializer, reducer, mem_state)
        def f(i):
            if thread_mem_req:
                return loop_body(base + i, thread_mem)
            else:
                return loop_body(base + i)

    prog = get_program()
    threads = []
    if thread_rounds:
        # Threads that run exactly ``thread_rounds`` iterations use the
        # argument rows after the first ``remainder`` ones.
        tape = prog.new_tape(f, (0, ), 'multithread')
        for i in range(n_threads - remainder):
            mem_state = make_array(initializer())
            args[remainder + i][0] = i * thread_rounds
            if len(mem_state):
                args[remainder + i][1] = mem_state.address
            threads.append(prog.run_tape(tape, remainder + i))
    if remainder:
        # Threads that run one extra iteration; their ranges start after
        # the block covered above and are ``thread_rounds + 1`` long each.
        tape1 = prog.new_tape(f, (1, ), 'multithread1')
        for i in range(remainder):
            mem_state = make_array(initializer())
            args[i][0] = (n_threads - remainder + i) * thread_rounds + i
            if len(mem_state):
                args[i][1] = mem_state.address
            threads.append(prog.run_tape(tape1, i))
    for thread in threads:
        prog.join_tape(thread)
    if state:
        # Fold every thread's partial result into the final state.
        if thread_rounds:
            for i in range(n_threads - remainder):
                state = reducer(Array(len(state), type(state[0]),
                                      args[remainder + i][1]), state)
        if remainder:
            for i in range(remainder):
                # NOTE(review): this branch passes ``.reg_type`` while the
                # branch above passes the type itself; kept as in the
                # original -- confirm both forms are accepted by Array.
                state = reducer(Array(len(state), type(state[0]).reg_type,
                                      args[i][1]), state)

    def returner():
        return untuplify(state)

    return returner
def make_array(l):
    """Copy ``l`` into a freshly allocated ``Array`` and return it.

    A single ``program.Tape.Register`` becomes a one-element array of the
    register's own type.  Anything else is treated as an iterable: the
    element type is taken from its first item, or ``cint`` when it is empty.
    """
    if not isinstance(l, program.Tape.Register):
        items = list(l)
        element_type = type(items[0]) if items else cint
        array = Array(len(items), element_type)
        array.assign(items)
        return array

    array = Array(1, type(l))
    array[0] = l
    return array
def approximate_reciprocal(divisor, k, f, theta):
    """Return an approximation of ``1/divisor`` where ``type(divisor) = cint``.

    The divisor is shifted left once for every leading zero among its ``k``
    decomposed bits, then ``theta`` rounds of ``q += q*e >> k``,
    ``e = e*e >> k`` are run starting from ``q = two_power(k)`` and
    ``e = twos_complement(normalized divisor)``.

    :param divisor: clear value to invert.
    :param k: number of bits decomposed and the shift used inside the
        iteration.
    :param f: enters the final shift ``2*k - 2*f - leading_zeros``;
        presumably the number of fractional bits.
    :param theta: number of refinement rounds.
    :returns: ``q`` shifted right by ``2*k - 2*f - cnt_leading_zeros``.
    """
    def twos_complement(x):
        # Flip each of the k bits of x (in reversed bit_decompose order)
        # while rebuilding the value, then add one.
        bits = x.bit_decompose(k)[::-1]
        bit_array = Array(k, cint)
        bit_array.assign(bits)

        twos_result = MemValue(cint(0))

        @for_range(k)
        def block(i):
            val = twos_result.read()
            val <<= 1
            val += 1 - bit_array[i]
            twos_result.write(val)

        return twos_result.read() + 1

    bit_array = Array(k, cint)
    bits = divisor.bit_decompose(k)[::-1]
    bit_array.assign(bits)

    # The counter is allocated once; the original allocated it twice and
    # never used the first cell.
    flag = MemValue(regint(0))
    cnt_leading_zeros = MemValue(regint(0))
    normalized_divisor = MemValue(divisor)

    @for_range(k)
    def block(i):
        # NOTE: Python parses this as ``(flag | bit) == 1`` because ``|``
        # binds tighter than ``==``.
        flag.write(flag.read() | bit_array[i] == 1)

        # While no set bit has been seen yet, count the zero and shift the
        # divisor left by one.
        @if_(flag.read() == 0)
        def block():
            cnt_leading_zeros.write(cnt_leading_zeros.read() + 1)
            normalized_divisor.write(normalized_divisor << 1)

    q = MemValue(two_power(k))
    e = MemValue(twos_complement(normalized_divisor.read()))

    @for_range(theta)
    def block(i):
        qread = q.read()
        eread = e.read()
        qread += (qread * eread) >> k
        eread = (eread * eread) >> k

        q.write(qread)
        e.write(eread)

    res = q >> (2 * k - 2 * f - cnt_leading_zeros)

    return res
def approximate_reciprocal(divisor, k, f, theta):
    """Return an approximation of ``1/divisor`` where ``type(divisor) = cint``.

    The divisor is shifted left once for every leading zero among its ``k``
    decomposed bits.  The refinement ``q = q + shift_two(q*e, k)``,
    ``e = shift_two(e*e, k)`` is then unrolled ``theta`` times at compile
    time, starting from ``q = two_power(k)`` and
    ``e = twos_complement(normalized divisor)``.

    :param divisor: clear value to invert.
    :param k: number of bits decomposed and the shift used inside the
        iteration.
    :param f: enters the final shift ``2*k - 2*f - leading_zeros``;
        presumably the number of fractional bits.
    :param theta: number of refinement rounds.
    :returns: ``shift_two(q, 2*k - 2*f - cnt_leading_zeros)``.
    """
    def twos_complement(x):
        # Flip each of the k bits of x (in reversed bit_decompose order)
        # while rebuilding the value, then add one.
        bits = x.bit_decompose(k)[::-1]
        bit_array = Array(k, cint)
        bit_array.assign(bits)

        twos_result = MemValue(cint(0))

        @for_range(k)
        def block(i):
            val = twos_result.read()
            val <<= 1
            val += 1 - bit_array[i]
            twos_result.write(val)

        return twos_result.read() + 1

    bit_array = Array(k, cint)
    bits = divisor.bit_decompose(k)[::-1]
    bit_array.assign(bits)

    # The counter is allocated once; the original allocated it twice and
    # never used the first cell.
    flag = MemValue(regint(0))
    cnt_leading_zeros = MemValue(regint(0))
    normalized_divisor = MemValue(divisor)

    @for_range(k)
    def block(i):
        # NOTE: Python parses this as ``(flag | bit) == 1`` because ``|``
        # binds tighter than ``==``.
        flag.write(flag.read() | bit_array[i] == 1)

        # While no set bit has been seen yet, count the zero and shift the
        # divisor left by one.
        @if_(flag.read() == 0)
        def block():
            cnt_leading_zeros.write(cnt_leading_zeros.read() + 1)
            normalized_divisor.write(normalized_divisor << 1)

    q = MemValue(two_power(k))
    e = MemValue(twos_complement(normalized_divisor.read()))

    qr = q.read()
    er = e.read()

    # Plain Python loop: the rounds are unrolled when the program is built.
    for _ in range(theta):
        qr = qr + shift_two(qr * er, k)
        er = shift_two(er * er, k)

    q = qr

    res = shift_two(q, (2*k - 2*f - cnt_leading_zeros))

    return res
def f(inc):
    """Thread tape body: run this thread's share of the loop.

    ``inc`` is added to ``thread_rounds`` to get the iteration count.  The
    thread's row in ``args`` supplies the first loop index (column 0), the
    address of its reduction-state array (column 1) and, when
    ``thread_mem_req`` is non-empty, ``regint`` scratch memory starting two
    cells into the row.  ``get_arg()`` presumably yields the argument the
    tape was started with.
    """
    if thread_mem_req:
        scratch = Array(thread_mem_req[regint], regint,
                        args[get_arg()].address + 2)

    # Fall back to cint when there is no state to take a type from.
    state_type = type(state[0]) if state else cint
    partial_state = Array(len(state), state_type, args[get_arg()][1])
    first_index = args[get_arg()][0]

    @map_reduce_single(n_parallel, thread_rounds + inc,
                       initializer, reducer, partial_state)
    def f(i):
        # Hand over the scratch memory only when it was requested.
        if not thread_mem_req:
            return loop_body(first_index + i)
        return loop_body(first_index + i, scratch)
def twos_complement(x):
    """Return the ``k``-bit complement of ``x`` plus one.

    The ``k`` bits of ``x`` are taken from ``bit_decompose`` in reversed
    order; the result is rebuilt bit by bit with every bit flipped, and one
    is added at the end.  ``k`` comes from the enclosing scope.
    """
    reversed_bits = x.bit_decompose(k)[::-1]
    bit_store = Array(k, cint)
    bit_store.assign(reversed_bits)

    flipped = MemValue(cint(0))

    @for_range(k)
    def block(i):
        # Shift the accumulator and append the inverted bit.
        acc = flipped.read()
        acc <<= 1
        acc += 1 - bit_store[i]
        flipped.write(acc)

    return flipped.read() + 1
def mergesort(A):
    """Sort ``A`` in place with a bottom-up merge sort.

    Runs of doubling width are merged into a scratch ``sint`` array, which
    is copied back into ``A`` after every pass.  Each element comparison
    ``A[x] <= A[y]`` is revealed to steer the merge.  Nothing is returned.

    NOTE(review): ``merge`` is called with ``i + 2 * width`` as its end
    index without clamping to ``len(A)`` -- confirm callers only pass
    lengths for which this stays in range.
    """
    scratch = Array(len(A), sint)

    def merge(i_left, i_right, i_end):
        # Merge A[i_left:i_right] and A[i_right:i_end] into scratch.
        left_pos = MemValue(i_left)
        right_pos = MemValue(i_right)

        @for_range(i_left, i_end)
        def loop(j):
            # Take from the left run if it still has elements and either
            # the right run is used up or the left head is not larger.
            take_left = and_(
                lambda: left_pos < i_right,
                or_(lambda: right_pos >= i_end,
                    lambda: regint(reveal(A[left_pos] <= A[right_pos]))))
            if_then(take_left)
            scratch[j] = A[left_pos]
            left_pos.iadd(1)
            else_then()
            scratch[j] = A[right_pos]
            right_pos.iadd(1)
            end_if()

    run_width = MemValue(1)

    @do_while
    def width_loop():
        @for_range(0, len(A), 2 * run_width)
        def merge_loop(i):
            merge(i, i + run_width, i + 2 * run_width)

        A.assign(scratch)
        run_width.imul(2)
        # Keep going until one run would cover the whole array.
        return run_width < len(A)
def test_while():
    """Check ``do_while`` by copying an array back to front, adding one.

    A secret counter starts at the last index and is decremented every
    pass; its revealed value is used as the index and, after the decrement,
    as the loop condition.  The target array must end up as
    ``[1, 2, 3, 4, 5]``.
    """
    size = 5
    remaining = MemValue(sint(size - 1))

    inputs = Array(size, sint)
    for idx in range(size):
        inputs[idx] = sint(idx)
    outputs = Array(size, sint)

    @do_while
    def body():
        secret_idx = remaining.read()
        clear_idx = secret_idx.reveal()
        outputs[clear_idx] = inputs[clear_idx] + 1
        remaining.write(secret_idx - 1)
        # Continue while the decremented counter is still a valid index.
        return remaining.reveal() >= 0

    runtime_assert_arr_equals([1, 2, 3, 4, 5], outputs, default_test_name())
def __init__(self, *args):
    """Initialise the instance by forwarding ``*args`` to ``Array.__init__``.

    No extra state is set up here; presumably the enclosing class derives
    from ``Array`` (its header is not visible from this block).
    """
    Array.__init__(self, *args)
def gen_dummy_cols(num_rows, num_cols):
    """Build ``num_cols`` column arrays of ``num_rows`` ``sint`` entries.

    Every entry of every column is set to 1.  Returns the columns as a list.
    """
    columns = []
    # Allocate all columns first, then fill them, in that order.
    for _ in range(num_cols):
        columns.append(Array(num_rows, sint))
    for column in columns:
        column.assign_all(1)
    return columns