# dansk.py -- forked from carlbordum/dansk.py
"""
dansk.py
~~~~~~~~
This module implements a decoder that translates Danish words to
their well-known English Python equivalents. It also registers a
codec called "dansk" on import.
Translations worked out in collaboration with Christoffer Moesgaard
and Robert Jensen.
"""
import argparse
import codecs
import functools
import io
from itertools import islice
import os
import pathlib
import sys
from tokenize import tokenize, untokenize, NAME, OP, TokenInfo
# Version string reported by the command line's --version option.
__version__ = "0.1.3"
# Translation table: Danish spelling -> the Python keyword or constant it
# stands for.  decode() looks up token strings in this mapping.
_viking_to_english = {
    "og": "and",
    "som": "as",
    # `hævd`, `påstå` and `postulèr` were considered as alternatives,
    # but `forvent` won because `ForventningsFejl` makes the most
    # sense as an exception name.
    "forvent": "assert",
    # Chosen over `asynk` on the grounds that the only reason it is
    # not `asynchronous` is that nobody can spell it.
    "asynkron": "async",
    "afvent": "await",
    "brud": "break",
    # If `klasse` were not an accepted computer-science term, we would
    # probably have chosen `slags` or `art`.
    "klasse": "class",
    "fortsæt": "continue",
    "lad": "def",
    "slet": "del",
    "ellers-hvis": "elif",
    "ellers": "else",
    "pånær": "except",
    "slutteligt": "finally",
    "for-hver": "for",
    "fra": "from",
    "altomfattende": "global",
    "hvis": "if",
    "indfør": "import",
    "indeni": "in",  # we don't want to disallow `i` as a variable name
    "er": "is",
    "λ": "lambda",
    "omfattende": "nonlocal",
    "ikke": "not",
    "eller": "or",
    "fisk": "pass",
    "hejs": "raise",
    "aflever": "return",
    "forsøg": "try",
    "medens": "while",
    "brug": "with",
    # This one was really hard. There were many good candidates, but in
    # the end we have to conclude that `yd` is best, as long as you listen
    # to https://youtu.be/llyedDrGQkA while writing your generators.
    "yd": "yield",
    "Falsk": "False",
    "Sand": "True",
    "Intet": "None",
}
def encode(string, errors="strict"):
    """Refuse to encode: the "dansk" codec only supports decoding.

    Falling back to the default encoder would suggest that
    ``encode(decode(source)) == source``, which cannot hold for this
    codec unless regular Python keywords were disallowed entirely.

    Args:
        string: The text that was asked to be encoded (unused).
        errors: The error-handling scheme (unused).

    Raises:
        NotImplementedError: Always.
    """
    # ``NotImplemented`` is the binary-operator sentinel and is not callable;
    # calling it fails with TypeError.  The exception type is
    # ``NotImplementedError``.
    raise NotImplementedError('the "dansk" codec cannot encode')
def _token_iter(tokens):
    """Collapse tokens considered a single token in Danish into one.

    The tokenizer sees ``ellers-hvis`` and ``for-hver`` as
    "ellers" MINUS "hvis" and "for" MINUS "hver" respectively, so every
    such NAME / OP / NAME triple is merged back into one NAME token.

    Send the tokenized Danish code through this function.

    Args:
        tokens: An indexable sequence of ``TokenInfo`` tuples, for
            example ``list(tokenize(readline))``.

    Yields:
        The tokens in their original order, with each
        ``ellers`` ``-`` ``hvis`` triple replaced by a single
        ``ellers-hvis`` NAME token and each ``for`` ``-`` ``hver`` triple
        by a single ``for-hver`` NAME token.  A merged token starts where
        the first word starts and ends where the second word ends.
    """
    # first word -> second word of every hyphenated keyword
    compounds = {"ellers": "hvis", "for": "hver"}

    def _is_compound(first, dash, second):
        return (
            first.type == NAME
            and first.string in compounds
            and dash.type == OP
            and dash.string == "-"
            and second.type == NAME
            and second.string == compounds[first.string]
        )

    count = len(tokens)
    index = 0
    # Walk by index so that a compound found anywhere -- including in the
    # last three tokens -- consumes exactly its three tokens and nothing is
    # emitted twice.
    while index < count:
        if index + 2 < count and _is_compound(
            tokens[index], tokens[index + 1], tokens[index + 2]
        ):
            first, second = tokens[index], tokens[index + 2]
            yield TokenInfo(
                NAME,
                first.string + "-" + second.string,
                first.start,
                second.end,
                first.line,
            )
            index += 3  # skip the dash and the second word as well
        else:
            yield tokens[index]
            index += 1
# Pass ignore_first_line=True if byteslike still contains the dansk
# encoding comment.
def decode(byteslike, errors="replace", *, ignore_first_line):
    """Translate Python source written with Danish keywords to regular Python.

    Args:
        byteslike: The source code as a bytes-like object.
        errors: Accepted so the function fits the codec decoder signature;
            it is not used.
        ignore_first_line: Keyword-only.  When true, the first line of
            ``byteslike`` is read and thrown away before tokenizing.

    Returns:
        A ``(text, consumed)`` tuple: the translated source as ``str`` and
        ``len(byteslike)``.
    """
    read_code = io.BytesIO(bytes(byteslike)).readline
    if ignore_first_line:
        read_code()  # it's the encoding comment

    # Only NAME tokens can be keywords.  Other token kinds may carry the very
    # same text -- on Python 3.12+ the literal part of f"er" is an
    # FSTRING_MIDDLE token whose string is "er" -- and must be left alone.
    # _replace() gives a copy of the token with only its string swapped.
    translated = [
        token._replace(string=_viking_to_english[token.string])
        if token.type == NAME and token.string in _viking_to_english
        else token
        for token in _token_iter(list(tokenize(read_code)))
    ]

    # tokenize() emits an ENCODING token first, so untokenize() hands back
    # bytes, which are turned into text here.
    return str(untokenize(translated), "utf-8"), len(byteslike)
# codecs.BufferedIncrementalDecoder is undocumented but well commented; should
# it ever break, it is easy to copy/paste it here.
class DanskIncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental decoder that translates only once the input is final."""

    def _buffer_decode(self, data, errors, final):
        """Decode ``data`` when ``final`` is true; otherwise consume nothing.

        Returns a ``(text, consumed)`` tuple.
        """
        if not final:
            # Nothing is decoded and zero bytes are reported as consumed
            # until the final call.
            return ("", 0)
        # This path is used when Python has already removed the encoding
        # comment, so tell decode() not to drop the first line.
        return decode(data, errors, ignore_first_line=False)
# Codec description for the name "dansk".  The plain decoder is decode() with
# ignore_first_line=True bound in; encode and DanskIncrementalDecoder fill the
# remaining roles.
codec_info = codecs.CodecInfo(
    encode,
    functools.partial(decode, ignore_first_line=True),
    incrementaldecoder=DanskIncrementalDecoder,
    name="dansk",
)
# The search function is the ``get`` method of a one-entry dict: it returns
# codec_info for "dansk" and None for every other name.
codecs.register({"dansk": codec_info}.get)
def finish_installation():
    """Place a .pth file in site-packages that loads dansk.py.

    site.py executes *.pth files in site-packages when Python starts.
    We need this because dansk.py has to be imported before our
    beautiful ``# coding=dansk`` programs are run, so the decoder can
    be found.

    The file goes into the virtualenv's site-packages when the
    ``VIRTUAL_ENV`` environment variable is set, and into the user
    site-packages otherwise.  The directory is created if it is missing.

    Raises:
        OSError: If the directory or the file cannot be written.
    """

    def get_site_packages_path():
        if os.environ.get("VIRTUAL_ENV"):
            # distutils was removed in Python 3.12; sysconfig is its
            # standard-library replacement for locating site-packages.
            import sysconfig

            return sysconfig.get_path("purelib")
        import site

        return site.getusersitepackages()

    site_packages = pathlib.Path(get_site_packages_path())
    # The user site-packages directory does not necessarily exist yet.
    site_packages.mkdir(parents=True, exist_ok=True)

    # prefixed with zzz, because they are executed alphabetically and
    # dansk.py has to be on path before it can be imported
    dansk_pth = site_packages / "zzz_register_dansk_encoding.pth"
    dansk_pth.write_text("import dansk", encoding="utf-8")
def main():
    """Command-line entry point for the ``dansk`` program.

    Any positional arguments trigger finish_installation().  Without
    them a hint is printed and the process exits with status 0.
    ``-v/--version`` and ``-h/--hjælp`` are handled by argparse.
    """
    cli = argparse.ArgumentParser(
        prog="dansk",
        description="Python, men dansk.",
        formatter_class=argparse.RawTextHelpFormatter,
        add_help=False,  # replaced by the Danish -h/--hjælp option below
    )
    cli.add_argument(
        "installér",
        nargs=argparse.REMAINDER,
        help="dansk.py indlæses når Python startes.",
    )
    cli.add_argument(
        "-v",
        "--version",
        action="version",
        version=__version__,
        help="skriv version information og stop.",
    )
    cli.add_argument(
        "-h",
        "--hjælp",
        action="help",
        help="vis denne besked og stop.",
    )

    options = cli.parse_args()
    if not options.installér:
        # no command given: point the user at the help text and stop
        print("dansk: mangler kommando\nPrøv 'dansk --hjælp' for mere information.")
        sys.exit(0)
    finish_installation()


if __name__ == "__main__":
    main()