-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenise.py
executable file
·159 lines (125 loc) · 4.26 KB
/
tokenise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python
import sys
# Sample numerals walked through by the demo when the script is run
# without command-line arguments.
test_strings = ['I', 'II', 'IX', 'IV', 'MCMLXXXVII', 'IM']

# Roman numeral symbols, smallest denomination first.
order_list = ['I', 'V', 'X', 'L', 'C', 'D', 'M']

# Integer value of each symbol.
conversions = dict(zip(order_list, (1, 5, 10, 50, 100, 500, 1000)))

# Goal
# 'IM' -> ['D','CCCC','L','XXXX','V','IIII']

# Rank of each symbol, i.e. its index in order_list; used to detect
# ascending (subtractive) pairs.
order = dict((symbol, rank) for rank, symbol in enumerate(order_list))
def tokenise( input_str ):
    """Split a numeral string into a list of tokens.

    Every adjacent pair whose first symbol ranks below its second (per the
    module-level ``order`` table) becomes a single two-character token;
    every other symbol becomes a one-character token.  Pairs are matched
    greedily from left to right.

    Raises:
        Exception: if three consecutive symbols are strictly ascending
            (e.g. ``IXL``), since chained ascending pairs are not handled.
        KeyError: if a symbol that gets looked up is missing from ``order``.
    """
    # Sanity check first: reject any run of three strictly ascending symbols.
    # The chained comparison looks symbols up lazily, left to right.
    for first, second, third in zip(input_str, input_str[1:], input_str[2:]):
        if order[first] < order[second] < order[third]:
            raise Exception("We don't support chained ascending pairs")

    tokens = []
    pos = 0
    last = len(input_str) - 1
    while pos < last:
        current = input_str[pos]
        following = input_str[pos + 1]
        if order[current] < order[following]:
            # Ascending pair: emit both symbols as one token.
            tokens.append(current + following)
            pos += 2
        else:
            # Ordinary symbol: copy it across unchanged.
            tokens.append(current)
            pos += 1

    # A trailing symbol is left over unless the final step consumed a pair.
    if pos <= last:
        tokens.append(input_str[pos])
    return tokens
def replace_token(token):
    """Expand a two-character ascending token using only its first symbol.

    A token such as ``IV`` stands for value(V) - value(I).  Expressed in
    units of the first symbol that is ``value(V) // value(I) - 1`` copies,
    so ``IV`` becomes ``IIII`` and ``VX`` becomes ``V``.  Tokens that are
    not exactly two characters long are returned unchanged.

    Raises:
        KeyError: if either symbol of a two-character token is missing from
            the module-level ``conversions`` table.
    """
    if len(token) != 2:
        return token
    small, large = token
    # Floor division keeps the count an integer on Python 2 and Python 3.
    # One unit is subtracted because the leading symbol is taken away from
    # the larger one, not added to it.
    repeats = conversions[large] // conversions[small] - 1
    return small * repeats
def replace_token2(token):
    """Expand a two-character token into additive symbols, largest first.

    The token's value is taken as value(second) - value(first).  The largest
    symbol that is strictly smaller than the remaining value is peeled off
    repeatedly until the remainder is no larger than the value of the
    token's first symbol; that first symbol is then appended.  For example
    ``IX`` becomes ``VIIII``.  Tokens that are not two characters long are
    returned unchanged.
    """
    if len(token) != 2:
        return token

    remaining = conversions[token[1]] - conversions[token[0]]
    # Stopping at the first symbol's value means nothing smaller than that
    # symbol is emitted, so no new ascending pairs are introduced.
    floor = conversions[token[0]]

    pieces = []
    while remaining > floor:
        for symbol in reversed(order_list):
            worth = conversions[symbol]
            if remaining > worth:
                pieces.append(symbol)
                remaining -= worth
                break
    pieces.append(token[0])
    return ''.join(pieces)
from canon import canon
from add import add
def tokenise_str(input_str):
    """Tokenise input_str, expand each token with replace_token2, and
    return the pieces joined into a single string."""
    expanded = map(replace_token2, tokenise(input_str))
    return ''.join(expanded)
def _run_demo(numeral):
    """Print every stage of the conversion pipeline for one numeral."""
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Running conversion process for %s" % numeral)
    print("After tokenisation: %s" % tokenise(numeral))
    simple_swap = [replace_token(c) for c in tokenise(numeral)]
    print("After basic expansion: %s" % simple_swap)
    clever_swap = [replace_token2(c) for c in tokenise(numeral)]
    print("After smart expansion: %s" % clever_swap)
    result = canon(''.join(clever_swap))
    print("After canonicalisation: %s" % result)
    # Two blank lines separate consecutive runs.
    print("")
    print("")


if __name__ == '__main__':
    args = sys.argv[1:]
    if not args:
        # No arguments: run the expansion demo over the built-in samples.
        for sample in test_strings:
            _run_demo(sample)
    elif len(args) == 1:
        # One argument: run the same demo for the supplied numeral.
        _run_demo(args[0])
    elif len(args) == 2:
        # Two arguments: simplify both numerals and add them.
        input_a, input_b = args
        simple_a = tokenise_str(input_a)
        print("First input %s simplifies to %s" % (input_a, simple_a))
        simple_b = tokenise_str(input_b)
        print("Second input %s simplifies to %s" % (input_b, simple_b))
        result = add(simple_a, simple_b)
        print("Result after addition: %s" % result)
        print("Canonical result: %s" % canon(result))
    else:
        # Too many arguments: report usage on stderr and exit non-zero.
        sys.stderr.write(
            "usage: %s [NUMERAL [NUMERAL]]\n"
            "  no arguments   run the built-in demo\n"
            "  one numeral    show its conversion steps\n"
            "  two numerals   add them\n" % sys.argv[0]
        )
        sys.exit(2)