def char_lit_rom_mode():
    """C-LIT - word that reads a byte encoded in the thread, and pushes it to the stack

    Emits the code found at label ``forth.internal.C-LIT``.  In order, the
    emitted sequence:

    * adds ``-(cost_of_char_lit_rom_mode // 2)`` to AC and stores the same
      constant in ``tmp0``;
    * writes 0 to the address ``[data_stack_pointer] - 1`` (the high byte of
      the cell about to be pushed);
    * lowers ``data_stack_pointer`` by 2, leaving the new value in X;
    * loads Y from ``IP_hi``, loads the constant 5 and jumps to ``[IP_lo]``
      in page Y.

    The low byte of the pushed cell is not written here.
    NOTE(review): presumably the code reached through IP stores the byte and
    then advances IP by the 5 left in AC - confirm against the thread layout.
    """
    label("forth.internal.C-LIT")
    adda(-(cost_of_char_lit_rom_mode // 2))
    ld(-(cost_of_char_lit_rom_mode // 2))
    C("Store cost")
    st([tmp0])
    ld([data_stack_pointer])
    C("Decrement Data stack pointer and store high byte of 0")
    suba(1)  # 5
    ld(AC, X)
    ld(0)
    st([X])  # High byte of the new cell is always zero for a char literal
    ld([data_stack_pointer])
    suba(2)  # 10
    ld(AC, X)  # X is left pointing at the new top-of-stack cell
    st([data_stack_pointer])
    ld([IP_hi], Y)
    C("Jump to the code in the thread")
    ld(5)
    C("We're going to shift the IP by 5")
    nop()  # 15, to meet requirement of move-ip that we must use an even number of cycles
    jmp(Y, [IP_lo])
    # NOTE(review): issued after the jmp; presumably runs in the jump's delay
    # slot so that Y is back at page zero on arrival - confirm.
    ld(0x00, Y)  # 17
def _left_shift_by_n():
    """Fixed cost routine to do a left-shift by 1-7 places

    Shift amount is passed in NEGATED in ac, value is loaded from [Y, X]
    Control is returned to address in continuation

    The routine is built from two "ladders" that are entered part-way down
    by computed branches:

    * a run of nops ending at ``.end-of-nops`` (entered so that ``7 - n``
      padding instructions execute), and
    * a run of ``adda(AC)`` doublings ending at ``.end-of-left-shifts``
      (entered so that ``n`` doublings execute).

    The per-line "Shift by k" comments mark the instruction at which each
    ladder is entered for a shift amount of k.  ``tmp0`` is used to hold the
    entry address into the doubling ladder while the nop ladder runs.
    """
    label("left-shift-by-n")
    # Because we do n shift operations, with 0 < n < 8
    # we need to balance it with 7 - n nops - so that we always do
    # 7 ops in total
    adda(lo(".end-of-left-shifts"))  # 1
    st([tmp0])  # Where we jump in the left-shifts
    suba(lo(".end-of-left-shifts") - 7)
    xora(0xFF)  # ac = -(shift-amount) + 7; Negate it.
    adda(lo(".end-of-nops") + 1)  # 5; +1 is to finish two's complement
    bra(AC)  # 6
    ld([tmp0])  # 7 ; Shift by 1
    nop()  # Shift by 2
    nop()  # Shift by 3
    nop()  # Shift by 4
    nop()  # Shift by 5
    nop()  # Shift by 6
    label(".end-of-nops")
    # AC holds the address reloaded from tmp0, so this branch enters the
    # doubling ladder below; the value to shift is fetched on the way in.
    bra(AC)  # 8;
    ld([Y, X])  # 9
    adda(AC)  # Shift by 7
    adda(AC)  # Shift by 6
    adda(AC)  # Shift by 5
    adda(AC)  # Shift by 4
    adda(AC)  # Shift by 3
    adda(AC)  # Shift by 2
    bra([continuation])  # 10 # Shift by 1
    label(".end-of-left-shifts")
    adda(AC)  # (counted as one of the 7)
def _push_ip_to_return_stack():
    """Emit code that pushes the two-byte IP onto the return stack.

    The emitted sequence lowers ``return_stack_pointer`` by 2 (keeping the
    new value in X), then stores ``IP_lo`` at ``[Y, X]`` and ``IP_hi`` at the
    following address.

    Y is not loaded here: as the attached comment says, the caller is
    expected to have Y already holding the page of the return stack.
    """
    ld([return_stack_pointer])
    C("Y holds the page of the return stack")
    C("Push [IP] to Return stack")
    suba(2)
    st([return_stack_pointer], X)
    ld([IP_lo])
    st([Y, Xpp])  # Low byte first; X is post-incremented to the high byte
    ld([IP_hi])
    st([Y, X])
def exit(vTicks, vReturn): label("forth.exit") # Counting down label("forth.exit.from-failed-test") ld(-(cost_of_failed_next1 + 1) / 2) # 7 label("forth.exit.from-next1-reenter") label("forth.exit.from-next2") adda([vTicks]) # 6 ld(hi("vBlankStart"), Y) # 5 bgt(pc() & 0xFF) # 4 suba(1) # 3 jmp(Y, [vReturn]) # 2 nop() # 1
def two_dup():
    """2DUP - duplicate the top four bytes (two cells) of the data stack.

    Emits the code at label ``forth.core.2DUP``:

    1. the four bytes starting at ``[data_stack_pointer]`` are copied, in
       ascending address order, into ``tmp0`` .. ``tmp3``;
    2. ``data_stack_pointer`` is lowered by 4;
    3. the four saved bytes are written out starting at the new pointer.

    NOTE(review): ``/`` is true division in Python 3, so the first operand is
    a float, whereas ``char_lit_rom_mode`` uses ``//`` - confirm the assembler
    accepts it.
    """
    scratch_bytes = (tmp0, tmp1, tmp2, tmp3)

    label("forth.core.2DUP")
    adda(-add_cost_of_next(cost_of_2dup) / 2)  # 1
    ld(data_stack_page, Y)
    ld([data_stack_pointer], X)  # 3

    # Save phase: three instructions per byte, so we finish on cycle
    # 15 = 3 + 4 * 3.  Writing the byte straight back with Xpp is only
    # there to step X on to the next byte.
    for scratch in scratch_bytes:
        ld([Y, X])
        st([scratch])
        st([Y, Xpp])

    # Open up room for two more cells.
    ld([data_stack_pointer])
    suba(4)
    st([data_stack_pointer], X)  # 18

    # Restore phase: two instructions per byte, so we finish on cycle
    # 26 = 18 + 4 * 2.
    for scratch in scratch_bytes:
        ld([scratch])
        st([Y, Xpp])

    NEXT(cost_of_2dup)
def next1(vTicks): """Routine to make continue or abort decisions, and dispatch to the next word""" # Invariant - on entry the vTicks variable and the accumulator both hold # an accurate number of cycles until we must be back in the display loop, # starting from the first instruction of this routine. # This value will always be greater than the cost of failing continue/abort test. This is true # whenever we return here from another word, and true when we first enter from the # display loop. label("forth.next1") C( "Timing point: [vTicks] == AC == accurate number of ticks until we need to be back" ) suba((cost_of_successful_test + cost_of_failfast) / 2) # 1 ld([W_hi], Y) # 2 jmp(Y, [W_lo]) # 3 bra("forth.restart-or-quit") # 4
def next1_reenter(vTicks):
    """Emit the re-entry path that a word uses to get back to ``forth.next1``.

    On entry AC is combined with ``[vTicks]`` after subtracting a fixed
    overhead, and the result is written back to ``[vTicks]``.  The cost of a
    failed test is then subtracted; if the result is negative control
    branches to ``forth.exit.from-next1-reenter``, otherwise it branches to
    ``forth.next1`` with ``[vTicks]`` reloaded into AC.

    Two entry labels are provided, ``.even`` (one extra ``nop``) and
    ``.odd``, so that callers can balance their own cycle parity.

    Args:
        vTicks: address operand of the tick counter that is read and updated.

    NOTE(review): ``/`` is true division in Python 3, so several operands
    below are floats, whereas ``char_lit_rom_mode`` uses ``//`` - confirm the
    assembler accepts them.
    """
    label("forth.next1.reenter")
    # When a word took an even number of cycles, enter here
    label("forth.next1.reenter.even")
    nop()  # 1
    # Inbound code should round down ticks, because counting is from .even
    label("forth.next1.reenter.odd")
    suba((cost_of_successful_test + cost_of_next1_reenter_success) / 2)  # 2
    adda([vTicks])  # 3
    st([vTicks])  # 4; If we exit successfully we'll be ready for next1
    suba(cost_of_failed_test / 2)  # 5
    blt(lo("forth.exit.from-next1-reenter"))  # 6
    # [vTicks] was stored assuming the success path; this is the difference
    # in cost between the two paths, loaded for the benefit of the exit code.
    vticks_error = cost_of_next1_reenter_success - cost_of_next1_reenter_failure
    ld((vticks_error / 2))  # 7 ; load vTicks wrongness into A
    bra(lo("forth.next1"))  # 8
    ld([vTicks])  # 9
def decrement():
    """Subtract one from the top of the stack (n -- n)

    Emits the code at label ``forth.core.1-``.  The low byte of the top cell
    is loaded and tested:

    * non-zero: the decremented low byte is stored and the word finishes
      with ``NEXT(cost_of_decrement__one_word_written)``;
    * zero: control goes to ``.low-byte-was-zero``, where the low byte is
      stored, the high byte is decremented as well, and the word finishes
      with ``NEXT(cost_of_decrement__two_words_written)``.

    NOTE(review): the cycle numbers show ``suba(1)`` being counted on both
    paths, i.e. it presumably runs in the branch's delay slot, so the value
    stored on the zero path is already decremented - confirm.
    NOTE(review): ``/`` is true division in Python 3, so the first operand is
    a float, whereas ``char_lit_rom_mode`` uses ``//`` - confirm the assembler
    accepts it.
    """
    label("forth.core.1-")
    adda(-add_cost_of_next(cost_of_decrement) / 2)  # 1
    ld(data_stack_page, Y)
    ld([data_stack_pointer], X)
    ld([Y, X])
    beq(lo(".low-byte-was-zero"))  # 5
    suba(1)  # 6
    st([Y, X])  # 7
    NEXT(cost_of_decrement__one_word_written)
    label(".low-byte-was-zero")
    st([Y, Xpp])  # 7
    # Borrow from the high byte
    ld([Y, X])
    suba(1)
    st([Y, X])  # 10
    NEXT(cost_of_decrement__two_words_written)
def dup():
    """DUP - duplicate the top cell of the data stack.

    Emits the code at label ``forth.core.DUP``: the two bytes at
    ``[data_stack_pointer]`` and ``[data_stack_pointer] + 1`` are saved in
    ``tmp0`` and ``tmp1``, ``data_stack_pointer`` is lowered by 2, and the
    saved bytes are written at the new pointer.

    The reads use ``[X]`` addressing while the writes use ``[Y, X]`` with
    ``Y = data_stack_page``.
    NOTE(review): that is only consistent if the data stack lives in the page
    that ``[X]`` addresses - confirm.
    NOTE(review): ``/`` is true division in Python 3, so the first operand is
    a float, whereas ``char_lit_rom_mode`` uses ``//`` - confirm the assembler
    accepts it.
    """
    label("forth.core.DUP")
    adda(-add_cost_of_next(cost_of_dup) / 2)
    # Save low byte of TOS
    ld([data_stack_pointer], X)
    ld([X])
    st([tmp0])
    # Save high byte of TOS
    ld([data_stack_pointer])  # 5
    adda(1, X)
    ld([X])
    st([tmp1])
    # Make room for one cell and write the copy
    ld(data_stack_page, Y)
    ld([data_stack_pointer])  # 10
    suba(2)
    st([data_stack_pointer], X)
    ld([tmp0])
    st([Y, Xpp])
    ld([tmp1])  # 15
    st([Y, X])  # 16
    NEXT(cost_of_dup)
def over():
    """OVER - push a copy of the second cell on the data stack.

    Emits the code at label ``forth.core.OVER``: the two bytes at
    ``[data_stack_pointer] + 2`` and ``[data_stack_pointer] + 3`` are saved
    in ``tmp0`` and ``tmp1``, ``data_stack_pointer`` is lowered by 2, and the
    saved bytes are written at the new pointer.

    The reads use ``[X]`` addressing while the writes use ``[Y, X]`` with
    ``Y = data_stack_page``.
    NOTE(review): that is only consistent if the data stack lives in the page
    that ``[X]`` addresses - confirm.
    NOTE(review): ``/`` is true division in Python 3, so the first operand is
    a float, whereas ``char_lit_rom_mode`` uses ``//`` - confirm the assembler
    accepts it.
    """
    label("forth.core.OVER")
    adda(-add_cost_of_next(cost_of_over) / 2)
    # Save low byte of the second cell
    ld([data_stack_pointer])
    adda(2, X)
    ld([X])
    st([tmp0])  # 5
    # Save high byte of the second cell
    ld([data_stack_pointer])
    adda(3, X)
    ld([X])
    st([tmp1])
    # Make room for one cell and write the copy
    ld(data_stack_page, Y)  # 10
    ld([data_stack_pointer])
    suba(2)
    st([data_stack_pointer], X)
    ld([tmp0])
    st([Y, Xpp])  # 15
    ld([tmp1])
    st([Y, X])  # 17
    NEXT(cost_of_over)
def do_docol_ram(): label("forth.DO-DOCOL-RAM") # Upon exit from this thread, we need to restore the mode # So the return stack needs to look like: # TOP-> [restore_mode, mode, IP] ld([return_stack_pointer]) # 1 suba(5) st([return_stack_pointer], X) st(lo("forth.RESTORE-MODE"), [Y, Xpp]) st(hi("forth.RESTORE-MODE"), [Y, Xpp]) # 5 ld([mode]) st([Y, Xpp]) ld([IP_lo]) st([Y, Xpp]) ld([IP_hi]) # 10 st([Y, X]) ld(lo("forth.next3.rom-mode")) st([mode]) # 13 _copy_W_to_IP(increment_by=8) NEXT(cost_of_docol_ram)
def next2(vTicks):
    """Emit ``forth.next2``, the path a word takes to reach the next word.

    The emitted code:

    * subtracts a fixed overhead from AC, adds ``[vTicks]`` and stores the
      result back in ``[vTicks]``;
    * sets W to the routine selected by ``mode``: ``W_lo`` is loaded from
      ``[mode]`` and ``W_hi`` is set to the page of ``forth.next3``;
    * subtracts the cost of a failed test from the tick count; if the result
      is negative it branches to ``forth.exit.from-next2``, otherwise to
      ``forth.next1`` with ``[vTicks]`` reloaded into AC.

    Args:
        vTicks: address operand of the tick counter that is read and updated.

    NOTE(review): here the extra ``nop`` is on the ``.odd`` entry, whereas in
    ``next1_reenter`` it is on the ``.even`` entry - confirm both are as
    intended.
    NOTE(review): ``/`` is true division in Python 3, so several operands
    below are floats, whereas ``char_lit_rom_mode`` uses ``//`` - confirm the
    assembler accepts them.
    """
    label("forth.next2")
    label("forth.next2.odd")
    nop()
    label("forth.next2.even")
    # On entry AC holds the negative of the number of ticks taken by the just executed instruction
    # To have entered the instruction we must have also had a successful test,
    suba((cost_of_successful_test + cost_of_next2_success) / 2)  # 1
    adda([vTicks])  # 2
    st([vTicks])  # 3; If we exit successfully we'll be ready for next1
    ld([mode])  # 4
    st([W_lo])  # 5
    ld(hi("forth.next3"))  # 6 # TODO
    st([W_hi])  # 7
    ld([vTicks])  # 8
    suba((cost_of_failed_test) / 2)  # 9
    blt(lo("forth.exit.from-next2"))  # 10
    # [vTicks] was stored assuming the success path; this is the difference
    # in cost between the two paths, loaded for the benefit of the exit code.
    tick_correction = cost_of_next2_success - cost_of_next2_failure
    ld(tick_correction / 2)  # 11; Restore
    bra(lo("forth.next1"))  # 12
    ld([vTicks])  # 13
def lit_rom_mode():
    """LIT - word that reads a number encoded in the thread, and pushes it to the stack

    Emits the code found at label ``forth.internal.LIT``.  In order, the
    emitted sequence:

    * adds ``-(cost_of_lit_rom_mode // 2)`` to AC and stores the same
      constant in ``tmp0``;
    * lowers ``data_stack_pointer`` by 2, leaving the new value in X;
    * loads Y from ``IP_hi``, loads the constant 6 and jumps to ``[IP_lo]``
      in page Y.

    Neither byte of the pushed cell is written here.
    NOTE(review): presumably the code reached through IP stores both bytes
    and then advances IP by the 6 left in AC - confirm against the thread
    layout.
    """
    label("forth.internal.LIT")
    adda(-(cost_of_lit_rom_mode // 2))
    ld(-(cost_of_lit_rom_mode // 2))
    C("Store cost")
    st([tmp0])
    ld([data_stack_pointer])
    C("Decrement Data stack pointer")
    suba(2)  # 5
    ld(AC, X)  # X is left pointing at the new top-of-stack cell
    st([data_stack_pointer])
    ld([IP_hi], Y)
    C("Jump to the code in the thread")
    ld(6)
    C("We're going to shift the IP by 6")
    nop()  # 10, to meet requirement of move-ip that we must use an even number of cycles
    jmp(Y, [IP_lo])
    # NOTE(review): issued after the jmp; presumably runs in the jump's delay
    # slot so that Y is back at page zero on arrival - confirm.
    ld(0x00, Y)  # 12
val = math.floor(i**2 / 4) ld(hi(val)) C(f"${val:04x} = {val} = floor({i} ** 2 / 4); ${val:04x} >> 8 = ${val >> 8:02x}" ) # We jump back here after looking up the low-byte of the result. label("low-byte return point") ld(hi("multiply 7x7"), Y) jmp(Y, [continuation]) ld(hi(pc()), Y) # Make it easy to get back here! cost_of_low_byte_return = 3 label("table entry.possibly-negative") # AC is negative, if b > a. Find absolute value blt(pc() + 3) # 1 bra(pc() + 3) # 2 suba(1) # 3; if >= 0 xora(0xFF) # 3; if < 0 adda(1) # 4 cost_of_absolute = 4 label("table entry") # Calculate an index into the high-byte table. # This is basically a matter of subtracting 32, and jumping in if the result >= 0. # But values greater than 160 have the sign-bit set after subtraction, # despite being >32. # We test for the sign bit and jump after subtraction even if 'negative' in these cases. st([tmp]) # 1 blt(pc() + 5) # 2 suba(32) # 3 bge(AC) # 4 bra([high_byte_action]) # 5 ld(0) # 6
def _shift_entry(
    *, offset_to_amount_eq_8, offset_to_amount_gt_8, offset_to_amount_lt_8
):
    """Emit the shared entry routine ``forth.core.shift.entry``.

    The routine pops the shift amount, classifies it, and either finishes
    immediately (amount zero, or amount of 16 or more) or adds one of the
    three offsets below to ``W_lo`` and re-enters via ``REENTER``.

    Before re-entering it leaves the following in zero-page variables:

    * n < 8: ``amount = -n`` and ``transfer_amount = n - 8``
    * n == 8: ``amount`` unchanged (``transfer_amount`` is written as a side
      effect of the shared instruction stream)
    * n > 8: ``amount = 8 - n``

    Args:
        offset_to_amount_eq_8: value added to ``W_lo`` when n == 8.
        offset_to_amount_gt_8: value added to ``W_lo`` when 8 < n < 16.
        offset_to_amount_lt_8: value added to ``W_lo`` when 0 < n < 8.
    """
    # Structurally left and right shift are very similar,
    # and we can share a lot of code.
    # There are five major cases for each (n is the shift amount):
    #  n == 0     : We don't do anything but adjust stack height.
    #  0 < n < 8  : The most complicated case - we need to shift both
    #               bytes and also transfer bits from one to the other
    #  n == 8     : Quite simple, one byte takes its value from the other
    #               which becomes zero
    #  8 < n < 16 : Shift one byte, and store into the other.
    #               Store zero in first byte.
    #  16 <= n    : Result is zero (technically we could ignore this).
    #
    # These have very different costs!
    # The entry point for both LSHIFT and RSHIFT call a single routine.
    # It loads the shift amount, and works out which of the cases we're
    # in. n == 0, and n >= 16 are both handled immediately, followed by
    # NEXT.
    # For the other three cases, we dispatch to different routines by
    # adjusting W and calling REENTER.
    # The code is structured so that the offset we need to apply to W is
    # the same whether we're doing a left or right shift.

    # LSHIFT and RSHIFT both begin with the following sequence
    # adda(-add_cost_of_next(cost_of_shift_entry) / 2)  # 1
    # ld(data_stack_page, Y)  # 2
    # ld([data_stack_pointer], X)  # 3
    # bra("forth.core.shift.entry")  # 4
    # ld([data_stack_pointer])  # 5
    # (The loads of X and Y technically happen elsewhere, but we count
    # them here)
    label("forth.core.shift.entry")
    adda(2)  # 6
    st([data_stack_pointer])
    # Load amount:
    ld([Y, X])  # Load low-byte of amount
    st([Y, Xpp])
    st([amount])  # 10
    ora([Y, X])
    beq("forth.core.shift.entry.amount-zero")  # 12
    # Numbers of 16 or more must have bit 4 or higher set.
    # AND with 0xf0 will reveal high bits set.
    ld(0xF0)  # 13; Test for 16s place or higher being set in low byte
    anda([amount])
    ora([Y, X])  # 15; Or any bit in high byte
    bne("forth.core.shift.entry.amount-gte16")  # 16
    # We want different values depending on which path we're going to follow
    # the n < 8 case wants -(n) and -(8 - n) = n - 8.
    # The n > 8 case wants -(n - 8) = 8 - n
    # The n = 8 case needs nothing.
    # Because the < 8 case has two variables, give it the "default" path
    # TODO: I feel very deeply that there must be a nicer way of doing this
    # TODO: Probably something todo with XOR.
    ld([amount])  # 17
    suba(8)
    # NOTE(review): the cycle numbers show the beq being counted on the
    # gt8 path and the st being counted on the eq8 path, i.e. each
    # presumably sits in the preceding branch's delay slot - confirm.
    bgt("forth.core.shift.entry.amount-gt8")  # 19
    beq("forth.core.shift.entry.amount-eq8")  # 20
    st([transfer_amount])  # 21
    # For the n < 8 case
    ld(0)
    suba([amount])
    st([amount])
    ld(offset_to_amount_lt_8)  # 25
    label(".adjust_W")
    # All three dispatching paths arrive here on cycle 26 with their
    # offset in AC.
    adda([W_lo])  # 26
    st([W_lo])  # 27
    REENTER(27)
    label("forth.core.shift.entry.amount-eq8")
    # Padding so that this path reaches .adjust_W on the same cycle
    nop()  # 22
    nop()
    bra(lo(".adjust_W"))  # 24
    ld(offset_to_amount_eq_8)  # 25
    label("forth.core.shift.entry.amount-gt8")
    ld(8)  # 21
    suba([amount])
    st([amount])
    bra(".adjust_W")  # 24
    ld(offset_to_amount_gt_8)  # 25
    label("forth.core.shift.entry.amount-zero")
    NEXT(13)
    label("forth.core.shift.entry.amount-gte16")
    # Step X past the amount cell, then zero both bytes of the result
    st([Y, Xpp])  # 18
    ld(0)
    st([Y, Xpp])  # 20
    st([Y, Xpp])  # 21
    NEXT(21)
def add():
    """+ - add the top two cells of the data stack (16-bit addition).

    Emits the code at labels ``forth.core.+`` and ``forth.core.CHAR+`` (both
    labels mark the same entry point).  The top cell is copied to
    ``tmp0``/``tmp1`` and dropped, the low bytes are added and stored, the
    carry out of the low byte is reconstructed from the sign bits of the
    operands and the result, and the high bytes are then added together
    with that carry.

    Both branches of each decision are padded to the same cycle count, so
    the word finishes with a single ``NEXT(cost_of_add)``.

    NOTE(review): ``/`` is true division in Python 3, so the first operand is
    a float, whereas ``char_lit_rom_mode`` uses ``//`` - confirm the assembler
    accepts it.
    """
    # This is exactly the same algorithm as in the vCPU implementation, but with my own comments to explain it to myself.
    label("forth.core.+")
    label("forth.core.CHAR+")
    adda(-add_cost_of_next(cost_of_add) / 2)  # 1
    low, high = tmp0, tmp1
    ld(data_stack_page, Y)
    C("Load and move data stack pointer")
    ld([data_stack_pointer], X)
    ld([data_stack_pointer])
    adda(2)  # 5
    st([data_stack_pointer])  # 6
    # Copy TOS to low, high
    # NOTE(review): `c` is rebound to whatever C() returns; presumably that
    # stops the comment being attached again on the second pass - confirm.
    c = "Copy TOS to zero-page"
    for address in [low, high]:
        ld([Y, X])
        c = C(c)
        st([address])
        st([Y, Xpp])  # 12 = 6 + 2 * 3
    # Add low bytes
    ld([Y, X])
    C("Add low bytes")
    adda([low])
    st([Y, Xpp])  # 15
    bmi(".add.result-has-1-in-bit-7")
    suba([low])  # 17 Restore to low-byte of TOS
    # We previously had a result with a 0 in bit seven 0xxxxxxx
    # We can now use the operands to work out if there was
    # a carry out of bit seven.
    # The truth table is as follows
    #    | A[7] | B[7] | Carry-in || Result[7] | Carry-out
    # ---|------------------------------------------------
    #  0 |    0 |    0 |        0 ||         0 |         0
    #  1 |    0 |    0 |        1 ||         1 |         0
    #  2 |    0 |    1 |        0 ||         1 |         0
    #  3 |    0 |    1 |        1 ||         0 |         1
    #  4 |    1 |    0 |        0 ||         1 |         0
    #  5 |    1 |    0 |        1 ||         0 |         1
    #  6 |    1 |    1 |        0 ||         0 |         1
    #  7 |    1 |    1 |        1 ||         1 |         1
    # Given that there is zero in bit seven (cases 0, 3, 5 and 6)
    # There is not a carry (case 0) when both A[7] and B[7] are 0
    # There is if either or both are 1.
    # Bitwise OR of the two operands will place the carry in bit seven
    bra(".add.carry-bit-in-msb")  # 18
    ora([low])  # 19
    label(".add.result-has-1-in-bit-7")
    # Given that there is one in bit seven (cases 1, 2, 4 and 7)
    # There is not a carry (case 1, 2, 4) when either A[7] or B[7] are 0
    # There is only a carry (case 7) when both are one.
    # Bitwise AND of the two operands will place the carry in bit seven
    bra(".add.carry-bit-in-msb")  # 18
    anda([low])  # 19
    label(".add.carry-bit-in-msb")
    # vCPU uses anda $80, x to load 0x00 or 0x80 to X, and loads [X],
    # using constant values at 0x80 and 0x00, but we still need X for now,
    # So branching on the sign-bit works out just as cheap.
    bmi(".add.carry")
    ld([Y, X])  # 21
    bra(".add.finish")
    adda([high])  # 23
    label(".add.carry")
    adda(1)  # 22
    adda([high])  # 23
    label(".add.finish")
    st([Y, X])  # 24
    NEXT(cost_of_add)