-
Notifications
You must be signed in to change notification settings - Fork 0
/
add_en_verb_conjugations.py
284 lines (265 loc) · 9.63 KB
/
add_en_verb_conjugations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pywikibot, re, sys, argparse
import blib
from blib import getparam, rmparam, msg, errandmsg, site, tname, pname
# Words that end in "s" (or otherwise look plural) but must NOT be treated as
# plurals when linking the individual words of a multiword verb expression,
# e.g. pluralia tantum ("trousers", "cahoots"), verb forms ("is", "has"),
# pronouns ("his", "its") and proper nouns ("Thames", "Cheerios").
dont_singularize = {
  "Browns",
  "Cheerios",
  "Jesus",
  "Thames",
  "Wheaties",
  "arms",
  "as",
  "backwards",
  "balls",
  "bejeebers",
  "blues",
  "bourgeois",
  "bowels",
  "brains",
  "breeches",
  "bus",
  "cahoots",
  "chops",
  "creeps",
  "dickens",
  "edgeways",
  "gas",
  "goods",
  "guts",
  "halfsies",
  "has",
  "his",
  "hots",
  "is",
  "its",
  "jeans",
  "jim-jams",
  "knickers",
  "nuts",
  "odds",
  "pants",
  "panties",
  "pros",
  "shits",
  "shorts",
  "this",
  "trousers",
  "upstairs",
  "us",
  "vitals",
  "wits",
  "yes",
  "yours",
}
# Explicit plural -> singular mappings for words whose singular cannot be
# derived by the regular rules in singularize() (irregular plurals like
# "feet"/"geese", and -ies/-es words whose lemma the default rules would get
# wrong, e.g. "cookies" -> "cookie", not "cooky").
singularize_as_such = {
  "cookies": "cookie",
  "eyeteeth": "eyetooth",
  "eye-teeth": "eye-tooth",
  "feet": "foot",
  "geese": "goose",
  "halves": "half",
  "lies": "lie",
  "pies": "pie",
  "presses": "press",
  "stompies": "stompie",
  "teeth": "tooth",
  "torpedoes": "torpedo",
  "walkies": "walkie",
}
def singularize(word):
  """Return `word` as a wikilink pointing at its singular (or verb-lemma)
  form, e.g. "dogs" -> "[[dog]]s", "feet" -> "[[foot|feet]]",
  "running" -> "[[run]]ning". Assumes singularizable(word) is true."""
  # Irregular forms come straight from the lookup table; use the bare-link
  # form "[[sing]]rest" when the plural is a pure suffixation of the
  # singular, otherwise a piped link "[[sing|plural]]".
  irregular = singularize_as_such.get(word)
  if irregular is not None:
    if word.startswith(irregular):
      return "[[%s]]%s" % (irregular, word[len(irregular):])
    return "[[%s|%s]]" % (irregular, word)
  # Regular -ies plural: "bodies" -> "[[body|bodies]]".
  if word.endswith("ies"):
    return "[[%s|%s]]" % (word[:-3] + "y", word)
  # Regular -es after a sibilant-ish ending: "boxes" -> "[[box]]es".
  if re.search("(ch|sh|x)es$", word):
    return "[[%s]]es" % word[:-2]
  # -ing form with a doubled final consonant: "running" -> "[[run]]ning".
  doubled = re.search(r"(^.*)([bcdfgjklmnpqrtv])\2ing$", word) # not s z h w x y
  if doubled:
    return "[[%s]]%sing" % (doubled.group(1) + doubled.group(2), doubled.group(2))
  # -ing form of a silent-e verb: "making" -> "[[make|making]]".
  silent_e = re.search(r"(^[bcdfghjklmnpqrstvwxyz]*[aeiou][bcdfgjklmnpqrstvz])ing$", word) # not h w x y
  if silent_e:
    return "[[%se|%sing]]" % (silent_e.group(1), silent_e.group(1))
  # Any other -ing form: "walking" -> "[[walk]]ing".
  if word.endswith("ing"):
    return "[[%s]]ing" % word[:-3]
  # Default: regular -s plural, "dogs" -> "[[dog]]s".
  return "[[%s]]s" % word[:-1]
def singularizable(word):
  """Return a truthy value if `word` looks like a plural noun or an -ing
  form whose lemma singularize() can produce."""
  # Known irregular forms are always singularizable.
  if word in singularize_as_such:
    return True
  # Regular -s plurals, excluding -ss/-'s endings and the explicit
  # exception list.
  if (word.endswith("s") and not word.endswith("ss")
      and not word.endswith("'s") and word not in dont_singularize):
    return True
  # -ing forms (must contain a vowel before the ending; "-thing" words like
  # "something" are not verb forms). NOTE: may return a Match object rather
  # than a bool; callers use it only in boolean context.
  return not word.endswith("thing") and re.search("^.*[aeiou].*ing$", word)
def link(word):
  """Wikilink a single word of a multiword head, singularizing plural/-ing
  forms so the link targets the lemma; "the" is left unlinked."""
  # "the" never satisfies singularizable(), so checking it first is safe.
  if word == "the":
    return word
  if singularizable(word):
    return singularize(word)
  return "[[" + word + "]]"
def canonicalize_existing_linked_head(head, pagemsg, link_the=False):
head = head.replace("’", "'").replace("[[one's|one's]]", "[[one's]]").replace("[['s|'s]]", "[['s]]")
words = re.split(r"((?:\[\[.*?\]\]|[^ \[\]])+)", head)
modwords = []
for word in words:
if word == "[[one]][['s]]":
modwords.append("[[one's]]")
elif word == "[[someone]][['s]]":
modwords.append("[[someone's]]")
elif not link_the and (word in ["the", "[[the]]"]):
modwords.append("the")
elif word and "[" not in word and "]" not in word and " " not in word:
modwords.append("[[%s]]" % word)
else:
modwords.append(word)
retval = "".join(modwords)
retval = re.sub(r"^\[\[to\]\] ", "", retval)
if head:
pagemsg("Canonicalized %s to %s" % (head, retval))
return retval
def process_text_on_page(index, pagename, text, verbs):
  """Convert {{head|en|verb}} templates in `text` to {{en-verb}} with an
  explicit conjugation, for the page `pagename`.

  In mode "full-conj", `verbs` maps the full pagename to a conjugation spec
  (with <...> angle-bracket specs) that becomes 1= of {{en-verb}}. In mode
  "single-word", `verbs` maps the first word of the pagename to that word's
  conjugation, and the remaining words are linked individually (plural nouns
  and -ing forms are delinked to their lemmas).

  Returns (new_text, notes), or None when no conjugation entry is found for
  the page (full-conj) or its first word (single-word).
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagename, txt))
  pagemsg("Processing")
  notes = []
  if args.mode == "full-conj":
    # The direcfile supplied a full conjugation whose delinked form is the
    # pagename itself; substitute it directly as 1=.
    if pagename not in verbs:
      pagemsg("WARNING: Couldn't find entry for pagename")
      return
    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
      tn = tname(t)
      origt = str(t)
      if tn == "head" and getparam(t, "1") == "en" and getparam(t, "2") == "verb":
        # Don't touch templates that already carry a conjugation in 3=.
        if getparam(t, "3"):
          pagemsg("WARNING: Already has 3=, not touching: %s" % str(t))
          continue
        blib.set_template_name(t, "en-verb")
        t.add("1", verbs[pagename])
        rmparam(t, "2")
        notes.append("convert {{head|en|verb}} of multiword expression to {{en-verb}}")
      if origt != str(t):
        pagemsg("Replaced %s with %s" % (origt, str(t)))
  else:
    # Mode "single-word": conjugate only the first word. NOTE(review): this
    # assumes the pagename contains a space (multiword expression) — a
    # single-word pagename would raise ValueError here.
    first, rest = pagename.split(" ", 1)
    if first not in verbs:
      pagemsg("WARNING: Couldn't find entry for first=%s" % first)
      return
    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
      tn = tname(t)
      origt = str(t)
      if tn == "head" and getparam(t, "1") == "en" and getparam(t, "2") == "verb":
        if getparam(t, "3"):
          pagemsg("WARNING: Already has 3=, not touching: %s" % str(t))
          continue
        blib.set_template_name(t, "en-verb")
        done = False
        words = pagename.split(" ")
        # Does any word look like a plural/-ing form that should be
        # delinked to its lemma when building the linked head?
        plural = False
        for word in words:
          if singularizable(word):
            plural = True
            break
        if plural:
          if verbs[first].startswith("<"):
            # Inline <...> spec: build 1= as "[[first]]<spec> rest..." with
            # the remaining words individually linked/singularized.
            restwords = []
            for word in words[1:]:
              restwords.append(link(word))
            param1 = "[[%s]]%s %s" % (first, verbs[first], " ".join(restwords))
            # The head that 1= would display, i.e. param1 minus the specs.
            head_from_param = re.sub("<.*?>", "", param1)
            existing_head = getparam(t, "head")
            canon_existing_head = canonicalize_existing_linked_head(existing_head, pagemsg)
            if canon_existing_head == head_from_param:
              # Existing head= is redundant with the generated one; drop it.
              pagemsg("Removing existing head %s" % existing_head)
              rmparam(t, "head")
              t.add("1", param1)
              done = True
            elif canon_existing_head != existing_head:
              # Head differs from what we'd generate; keep it (canonicalized)
              # and leave 1= for the not-done fallback below.
              pagemsg("Replacing existing head %s with canonicalized %s" % (existing_head, canon_existing_head))
              t.add("head", canon_existing_head)
              pagemsg("WARNING: Existing head not removed (canonicalized to %s, different from head-from-param %s): %s" %
                (canon_existing_head, head_from_param, origt))
            elif existing_head:
              pagemsg("WARNING: Existing head not removed (different from head-from-param %s): %s" %
                (head_from_param, origt))
            else:
              # No existing head= at all; just set 1=.
              t.add("1", param1)
              done = True
          else:
            # Bare conjugation spec: it goes in 1= as-is, and the linked
            # expression (if needed) goes in head=.
            t.add("1", verbs[first])
            headwords = []
            for word in words:
              if not headwords: # first word
                headwords.append("[[" + word + "]]")
              else:
                headwords.append(link(word))
            head_from_param = " ".join(headwords)
            existing_head = getparam(t, "head")
            canon_existing_head = canonicalize_existing_linked_head(existing_head, pagemsg)
            if canon_existing_head == head_from_param:
              pagemsg("Removing existing head %s" % existing_head)
              rmparam(t, "head")
            elif canon_existing_head != existing_head:
              pagemsg("Replacing existing head %s with canonicalized %s" % (existing_head, canon_existing_head))
              t.add("head", canon_existing_head)
              pagemsg("WARNING: Existing head not removed (canonicalized to %s, different from head-from-param %s): %s" %
                (canon_existing_head, head_from_param, origt))
            elif existing_head:
              pagemsg("WARNING: Existing head not removed (different from head-from-param %s): %s" %
                (head_from_param, origt))
            else:
              t.add("head", head_from_param)
            done = True
        if not done:
          # Fallback (no plural words, or head mismatch above): canonicalize
          # or remove any existing head=, then set 1= from the first word.
          existing_head = getparam(t, "head")
          if existing_head:
            # The head the template would display with no head= param:
            # every word linked except bare "the".
            head_from_param = " ".join("[[%s]]" % word if word != "the" else word for word in pagename.split(" "))
            canon_existing_head = canonicalize_existing_linked_head(existing_head, pagemsg)
            if canon_existing_head == head_from_param:
              pagemsg("Removing existing head %s" % existing_head)
              rmparam(t, "head")
            elif canon_existing_head != existing_head:
              pagemsg("Replacing existing head %s with canonicalized %s" % (existing_head, canon_existing_head))
              t.add("head", canon_existing_head)
              pagemsg("WARNING: Existing head not removed (canonicalized to %s, different from head-from-param %s): %s" %
                (canon_existing_head, head_from_param, origt))
            else:
              pagemsg("WARNING: Existing head not removed (different from head-from-param %s): %s" %
                (head_from_param, origt))
          if verbs[first].startswith("<"):
            # Attach the inline spec to the first word, rest unlinked.
            t.add("1", "%s%s %s" % (first, verbs[first], rest))
          else:
            t.add("1", verbs[first])
        rmparam(t, "2")
        notes.append("convert {{head|en|verb}} of multiword expression to {{en-verb}}")
      if origt != str(t):
        pagemsg("Replaced %s with %s" % (origt, str(t)))
  return str(parsed), notes
parser = blib.create_argparser("Convert {{head|en|verb}} to {{en-verb}} with specified conjugation",
  include_pagefile=True, include_stdin=True)
parser.add_argument("--direcfile", help="File of conjugated verbs")
parser.add_argument("--mode", choices=["full-conj", "single-word"], help="Operating mode. If 'full-conj', --direcfile contains full conjugations with <>. If 'single-word', --direcfile contains the first word followed by the conjugation of that word.")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

# Map from pagename (mode "full-conj") or first word of the pagename (mode
# "single-word") to the conjugation spec to substitute into {{en-verb}}.
verbs = {}
for line in blib.yield_items_from_file(args.direcfile):
  if args.mode == "full-conj":
    # Key is the line with the <...> conjugation specs stripped, which
    # should equal the pagename.
    verb = re.sub("<.*?>", "", line)
    verbs[verb] = line
  else:
    # Expected line format: "<first-word> <conjugation-spec>".
    if " " not in line:
      msg("WARNING: No space in line: %s" % line)
      continue
    verb, spec = line.split(" ", 1)
    verbs[verb] = spec

def do_process_text_on_page(index, pagename, text):
  # Adapter binding `verbs`, so the callback matches the 3-argument
  # signature blib.do_pagefile_cats_refs expects.
  return process_text_on_page(index, pagename, text, verbs)

blib.do_pagefile_cats_refs(args, start, end, do_process_text_on_page, edit=True, stdin=True)