/
add_ru_relational_adjs_dims_to_nouns.py
262 lines (245 loc) · 10.7 KB
/
add_ru_relational_adjs_dims_to_nouns.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pywikibot, re, sys, argparse
import blib
from blib import getparam, rmparam, tname, pname, msg, site
import rulib
def add_rel_adj_or_dim_to_noun_page(nounpage, index, new_adj_or_dims, param, desc):
notes = []
pagetitle = str(nounpage.title())
def pagemsg(txt):
msg("Page %s %s: %s" % (index, pagetitle, txt))
text = str(nounpage.text)
retval = blib.find_modifiable_lang_section(text, "Russian", pagemsg)
if retval is None:
pagemsg("WARNING: Couldn't find Russian section for noun of %s %s" % (
desc, ",".join(new_adj_or_dims)))
return
sections, j, secbody, sectail, has_non_lang = retval
parsed = blib.parse_text(secbody)
head = None
for t in parsed.filter_templates():
tn = tname(t)
if tn in ["ru-noun+", "ru-proper noun+", "ru-noun", "ru-proper noun"]:
if head:
pagemsg("WARNING: Saw multiple heads %s and %s for noun of %s %s, not modifying" %
(str(head), str(t), desc, ",".join(new_adj_or_dims)))
return
head = t
if not head:
pagemsg("WARNING: Couldn't find head for noun of %s %s" % (desc, ",".join(new_adj_or_dims)))
return
orig_adjs_or_dims = blib.fetch_param_chain(head, param, param)
adjs_or_dims = blib.fetch_param_chain(head, param, param)
added_adjs_or_dims = []
for adj_or_dim in new_adj_or_dims:
if adj_or_dim in adjs_or_dims:
pagemsg("Already saw %s %s in head %s" % (desc, adj_or_dim, str(head)))
else:
adjs_or_dims.append(adj_or_dim)
added_adjs_or_dims.append(adj_or_dim)
if adjs_or_dims != orig_adjs_or_dims:
orighead = str(head)
blib.set_param_chain(head, adjs_or_dims, param, param)
pagemsg("Replaced %s with %s" % (orighead, str(head)))
notes.append("add %s=%s to Russian noun" % (param, ",".join(added_adjs_or_dims)))
secbody = str(parsed)
subsecs = re.split("(^==.*==\n)", secbody, 0, re.M)
for k in range(2, len(subsecs), 2):
if "==Derived terms==" in subsecs[k - 1] or "==Related terms==" in subsecs[k - 1]:
header = re.sub("=", "", subsecs[k - 1]).strip()
for adj_or_dim in adjs_or_dims:
def note_removed_text(m):
if m.group(1):
pagemsg("Removed '%s' term with gloss for noun of %s %s: %s" %
(header, desc, adj_or_dim, m.group(0)))
return ""
newsubsecsk = re.sub(r"\{\{[lm]\|ru\|%s((?:\|[^{}\n]*)?)\}\}" % adj_or_dim, note_removed_text, subsecs[k])
if newsubsecsk != subsecs[k]:
notes.append("remove %s %s from %s" % (desc, adj_or_dim, header))
subsecs[k] = newsubsecsk
subsecs[k] = re.sub(", *,", ",", subsecs[k])
# Repeat in case adjacent terms removed (unlikely though).
subsecs[k] = re.sub(", *,", ",", subsecs[k])
subsecs[k] = re.sub(" *, *$", "", subsecs[k], 0, re.M)
subsecs[k] = re.sub(r"^\* *, *", "* ", subsecs[k], 0, re.M)
subsecs[k] = re.sub(r"^\* *(\n|$)", "", subsecs[k], 0, re.M)
if re.search(r"^\s*$", subsecs[k]):
subsecs[k] = ""
subsecs[k - 1] = ""
secbody = "".join(subsecs)
secj = secbody + sectail
newsecj = re.sub(r"\n\n\n+", "\n\n", secj)
if newsecj != secj and not notes:
notes.append("eliminate sequences of 3 or more newlines")
secj = newsecj
sections[j] = secj
return "".join(sections), notes
def add_rel_adj_or_dim_to_noun(index, adjs_or_dims, noun, param, desc):
pagetitle = rulib.remove_accents(blib.remove_links(noun))
def pagemsg(txt):
msg("Page %s %s: %s" % (index, pagetitle, txt))
nounpage = pywikibot.Page(site, pagetitle)
if not blib.safe_page_exists(nounpage, pagemsg):
pagemsg("WARNING: Noun %s for %s %s doesn't exist" % (noun, desc, ",".join(adjs_or_dims))
return
def do_add_rel_adj_or_dim_to_noun_page(page, index, parsed):
return add_rel_adj_or_dim_to_noun_page(page, index, adjs_or_dims, param, desc)
blib.do_edit(nounpage, index, do_add_rel_adj_or_dim_to_noun_page, save=args.save, verbose=args.verbose,
diff=args.diff)
def process_section_for_relational_adj_snarf(index, pagetitle, text, is_multi_etym_section):
def pagemsg(txt):
msg("Page %s %s: %s" % (index, pagetitle, txt))
if not re.search(r"\{\{lb\|ru\|([^{}]*\|)*relational[|}]", text):
pagemsg("Not a relational adjective")
return
parsed = blib.parse_text(text)
adj = None
for t in parsed.filter_templates():
if tname(t) == "ru-adj":
if getparam(t, "head2"):
pagemsg("WARNING: Multihead relational adjective %s, skipping" % str(t))
return
newadj = getparam(t, "1") or pagetitle
if adj and adj != newadj:
pagemsg("WARNING: Saw multiple adjectives %s and %s on relational page, skipping: head=%s" %
(adj, newadj, str(t)))
return
if "[[" in newadj:
pagemsg("WARNING: Saw links in relational adjective %s, skipping: head=%s" % (
newadj, str(t)))
return
adj = newadj
subsecs = re.split("(^==.*==\n)", text, 0, re.M)
if is_multi_etym_section:
etymtext = subsecs[0]
else:
for k in range(2, len(subsecs), 2):
if "==Etymology==" in subsecs[k - 1]:
etymtext = subsecs[k]
break
else:
pagemsg("WARNING: Relational adjective %s but couldn't find etymology section" % adj)
return
parsed = blib.parse_text(etymtext)
for t in parsed.filter_templates():
tn = tname(t)
if tn in ["affix", "af", "suffix", "suf"] and getparam(t, "1") == "ru":
noun = getparam(t, "2")
if getparam(t, "lang1"):
pagemsg("WARNING: lang1= in affix template %s for relational adjective %s" % (str(t), adj))
elif noun.endswith("-"):
pagemsg("WARNING: Prefix %s found as putative source noun for relational adjective %s: affix template %s" %
(noun, adj, str(t)))
elif noun.startswith("-"):
pagemsg("WARNING: Suffix %s found as putative source noun for relational adjective %s: affix template %s" %
(noun, adj, str(t)))
elif not noun:
pagemsg("WARNING: Blank string found as putative source noun for relational adjective %s: affix template %s" %
(adj, str(t)))
elif tn in ["affix", "af"] and not getparam(t, "3").startswith("-"):
pagemsg("WARNING: Apparent compound etymology for relational adjective %s, skipping: affix template %s" %
(adj, str(t)))
elif tn in ["affix", "af"] and getparam(t, "3").endswith("-"):
pagemsg("WARNING: Infix %s, hence apparent compound etymology for relational adjective %s, skipping: affix template %s" %
(getparam(t, "3"), adj, str(t)))
else:
msg("%s ||| %s" % (adj, noun))
break
else:
pagemsg("WARNING: Relational adjective %s, found etymology section but not affix template" % adj)
def snarf_relational_adjs(index, pagetitle, text):
def pagemsg(txt):
msg("Page %s %s: %s" % (index, pagetitle, txt))
pagemsg("Processing")
#retval = blib.find_modifiable_lang_section(text, "Russian", pagemsg)
#if retval is None:
# pagemsg("WARNING: Couldn't find Russian section")
# return
#sections, j, secbody, sectail, has_non_lang = retval
secbody = text
if "Etymology 1" in secbody:
etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M)
for k in range(2, len(etym_sections), 2):
process_section_for_relational_adj_snarf(index, pagetitle, etym_sections[k], True)
else:
process_section_for_relational_adj_snarf(index, pagetitle, secbody, False)
def process_section_for_diminutive_snarf(index, pagetitle, text, is_multi_etym_section):
def pagemsg(txt):
msg("Page %s %s: %s" % (index, pagetitle, txt))
def expand_text(tempcall):
return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
parsed = blib.parse_text(text)
saw_dim = False
for t in parsed.filter_templates():
if tname(t) in ["diminutive of", "dim of", "endearing diminutive of"]:
saw_dim = True
if not saw_dim:
return
nount = None
saw_dim = False
for t in parsed.filter_templates():
if tname(t) in ["ru-noun+", "ru-noun"]:
if nount and not saw_dim:
pagemsg("WARNING: Saw multiple heads (first=%s, second=%s), the first of which may or may not be a diminutive" % (
str(nount), str(t)))
nount = t
saw_dim = False
if tname(t) in ["diminutive of", "dim of", "endearing diminutive of"]:
if not nount:
pagemsg("WARNING: Didn't see head for diminutive noun, skipping")
return
if tname(nount) == "ru-noun":
heads = blib.fetch_param_chain(nount, "1", "head")
else:
nounargs = rulib.fetch_noun_args(nount, expand_text, forms_only=True)
if not nounargs:
return
if "nom_sg" in nounargs:
heads = nounargs["nom_sg"].split(",")
else:
heads = nounargs["nom_pl"].split(",")
saw_dim = True
dimofs = blib.remove_links(getparam(t, "2"))
for dimof in re.split(", *", dimofs):
msg("%s ||| %s" % (",".join(heads), dimof))
def snarf_diminutives(index, pagetitle, text):
def pagemsg(txt):
msg("Page %s %s: %s" % (index, pagetitle, txt))
pagemsg("Processing")
#retval = blib.find_modifiable_lang_section(text, "Russian", pagemsg)
#if retval is None:
# pagemsg("WARNING: Couldn't find Russian section")
# return
#sections, j, secbody, sectail, has_non_lang = retval
secbody = text
if "Etymology 1" in secbody:
etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M)
for k in range(2, len(etym_sections), 2):
process_section_for_diminutive_snarf(index, pagetitle, etym_sections[k], True)
else:
process_section_for_diminutive_snarf(index, pagetitle, secbody, False)
parser = blib.create_argparser("Snarf Russian relational adjectives or diminutives or add to corresponding noun",
include_pagefile=True, include_stdin=True)
parser.add_argument('--direcfile', help="File of adjectives/diminutives and nouns, from a previous run of same script")
parser.add_argument('--pos', help="Part of speech ('reladj' or 'dim')", required=True)
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)
if args.direcfile:
for index, line in blib.iter_items_from_file(args.direcfile, start, end):
if " ||| " in line:
adj_or_dim, noun = re.split(r" \|\|\| ", line)
else:
adj_or_dim, noun = re.split(" ", line)
adjs_or_dims = adj_or_dim.split(",")
if args.pos == "reladj":
add_rel_adj_or_dim_to_noun(index, adjs_or_dims, noun, 'adj', 'relational adjective')
else:
add_rel_adj_or_dim_to_noun(index, adjs_or_dims, noun, 'dim', 'diminutive')
else:
def process_text_on_page(index, pagename, text):
if args.pos == "reladj":
snarf_relational_adjs(index, pagename, text)
else:
snarf_diminutives(index, pagename, text)
blib.do_pagefile_cats_refs(args, start, end, process_text_on_page, stdin=True)