def create_cat(cat, catargs, extratext=None):
  """Create a Belarusian part-of-speech category page.

  `cat` is the category name with "~" standing in for the pluralized part
  of speech (taken from the global command-line `args.pos`).  The page body
  invokes the matching {{be-POS cat}} boilerplate template, passing
  `catargs` as extra template arguments, with `extratext` (if given)
  appended on a following line.  Empty categories are skipped; existing
  pages are left alone unless --overwrite was given; nothing is saved
  unless --save was given.
  """
  global args
  # Map the command-line POS to the full name (used in the category title)
  # and the short name (used in the boilerplate template name).
  if args.pos == "verb":
    full_pos, short_pos = "verb", "verb"
  elif args.pos == "adj":
    full_pos, short_pos = "adjective", "adj"
  elif args.pos == "noun":
    full_pos, short_pos = "noun", "noun"
  else:
    assert False, "Invalid pos %s" % args.pos
  cat = "Belarusian " + cat.replace("~", "%ss" % full_pos)
  text = "{{be-%s cat%s}}" % (short_pos, "".join("|" + arg for arg in catargs))
  if extratext:
    text += "\n%s" % extratext
  # Don't bother creating categories that would have no members.
  if not list(blib.cat_articles(cat)):
    return
  cat = "Category:" + cat
  page = pywikibot.Page(site, cat)
  if not args.overwrite and page.exists():
    msg("Page %s already exists, not overwriting" % cat)
    return
  page.text = unicode(text)
  changelog = "Creating '%s' with text '%s'" % (cat, text)
  msg("Changelog = %s" % changelog)
  if args.save:
    blib.safe_page_save(page, changelog, errandmsg)
def process_page(page, index):
  """Null-save `page` (a no-op edit) so MediaWiki reprocesses it.

  Skips non-mainspace pages when --ignore-non-mainspace is given, and
  skips nonexistent pages entirely, since a null save would create them.
  """
  global args
  pagetitle = unicode(page.title())
  # A colon in the title indicates a page outside the main namespace.
  if args.ignore_non_mainspace and ":" in pagetitle:
    return
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  if not blib.safe_page_exists(page, pagemsg):
    pagemsg("WARNING: Page doesn't exist, null-saving it would create it")
    return
  # pagemsg("Null-saving")
  blib.safe_page_save(page, "null save", errandpagemsg)
def create_cat(cat, args, adj=False, verb=False):
  """Create a Russian part-of-speech category page.

  `cat` is the category name with "~" standing in for the pluralized part
  of speech.  `args` is a list of arguments to pass to the boilerplate
  template.  Exactly one of `verb`/`adj` may be set; nouns are the default.
  Honors the module-level `overwrite` and `dosave` flags.
  """
  # Choose the "~" replacement and the boilerplate template by POS.
  if verb:
    replacement, text = "verbs", "{{ruverbcatboiler}}"
  elif adj:
    replacement, text = "adjectives", "{{ruadjcatboiler|%s}}" % "|".join(args)
  else:
    replacement, text = "nouns", "{{runouncatboiler|%s}}" % "|".join(args)
  cat = "Category:Russian " + cat.replace("~", replacement)
  page = pywikibot.Page(site, cat)
  if not overwrite and page.exists():
    msg("Page %s already exists, not overwriting" % cat)
    return
  page.text = unicode(text)
  changelog = "Creating '%s' with text '%s'" % (cat, text)
  msg("Changelog = %s" % changelog)
  if dosave:
    blib.safe_page_save(page, changelog, errandmsg)
def save_template_doc(tempname, doc, save):
  """Save `doc` as the /documentation subpage of [[Template:TEMPNAME]].

  Always echoes the full text to the log between begin/end markers; only
  actually writes the page when `save` is true.  Increments the global
  `nextpage` counter, which is used as the page index in log messages.
  """
  global nextpage
  msg("For [[Template:%s]]:" % tempname)
  msg("------- begin text --------")
  msg(doc.rstrip('\n'))
  msg("------- end text --------")
  comment = "Update form-of template documentation"
  nextpage += 1
  def pagemsg(txt):
    msg("Page %s %s: %s" % (nextpage, tempname, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (nextpage, tempname, txt))
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  page = pywikibot.Page(site, "Template:%s/documentation" % tempname)
  pagemsg("Saving with comment = %s" % comment)
  page.text = doc
  blib.safe_page_save(page, comment, errandpagemsg)
def process_page(page, index):
  """Create a category page by saving "{{auto cat}}" to it.

  Guard clauses skip pages that already exist, are not categories, are
  blacklisted, or are empty.  The {{auto cat}} template is pre-expanded
  and the page is only created when the expansion shows no errors.
  Nothing is saved unless --save was given.
  """
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
    errmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  if args.verbose:
    pagemsg("Processing")
  # Guard clauses: bail out on anything we shouldn't touch.
  if page.exists():
    errandpagemsg("Page already exists, not overwriting")
    return
  if not pagetitle.startswith("Category:"):
    pagemsg("Page not a category, skipping")
    return
  catname = re.sub("^Category:", "", pagetitle)
  if blacklist(catname):
    pagemsg("Category is blacklisted, skipping")
    return
  # Only create categories that actually have members or subcategories.
  num_pages = len(list(blib.cat_articles(catname)))
  num_subcats = len(list(blib.cat_subcats(catname)))
  if num_pages == 0 and num_subcats == 0:
    pagemsg("Skipping empty category")
    return
  contents = u"{{auto cat}}"
  result = expand_text(contents)
  if not result:
    return
  # Refuse to create the page if the expanded template reports problems.
  has_errors = ("Category:Categories with invalid label" in result or
      "The automatically-generated contents of this category has errors" in result)
  if has_errors:
    pagemsg("Won't create page, would lead to errors: <%s>" % result)
    return
  pagemsg("Creating page, output is <%s>" % result)
  comment = 'Created page with "%s"' % contents
  if args.save:
    page.text = contents
    if blib.safe_page_save(page, comment, errandpagemsg):
      errandpagemsg("Created page, comment = %s" % comment)
  else:
    pagemsg("Would create, comment = %s" % comment)
def process_page(page, index, args, contents):
  """Create `page` with the given `contents` unless it already exists.

  Logs what it would do; only writes when args.save is true, and reports
  success of the save via the error-and-page logger.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  if args.verbose:
    pagemsg("Processing")
  # Never clobber an existing page.
  if page.exists():
    errandpagemsg("Page already exists, not overwriting")
    return
  comment = 'Created page with "%s"' % contents
  if not args.save:
    pagemsg("Would create, comment = %s" % comment)
    return
  page.text = contents
  if blib.safe_page_save(page, comment, errandpagemsg):
    errandpagemsg("Created page, comment = %s" % comment)
u"मिलाना", u"रखना", u"रहना", u"लगना", u"लगाना", u"लेना", u"सकना", u"समझना", u"सुनाना", u"सूखना", u"हिलाना", u"होना", ] parser = blib.create_argparser(u"Create Hindi phrasal verb categories") args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) for cat in cats: pagename = "Category:Hindi phrasal verbs with particle (%s)" % cat page = pywikibot.Page(site, pagename) if page.exists(): msg("Page %s already exists, not overwriting" % pagename) continue text = "[[Category:Hindi phrasal verbs|%s]]" % cat page.text = text changelog = "Create '%s' with text '%s'" % (pagename, text) msg("Changelog = %s" % changelog) if args.save: blib.safe_page_save(page, changelog, errandmsg)
def process_page(index, num, save, verbose, params):
  """Create or update the Wiktionary entry for the Russian numeral `num`.

  index: page index used in log messages.
  num: the numeral (passed to ru_num()/generate_page() to derive the lemma
    and the new page text).
  save: if true, actually save; otherwise only log what would happen.
  verbose: if true, log full page text being written.
  params: object with overwrite_page / overwrite_etymologies flags
    controlling whether an existing Russian section may be replaced.

  If the page doesn't exist it is created outright.  If it exists, the page
  is split into per-language sections; an existing Russian section is
  either overwritten (subject to `params`) or left alone, and otherwise the
  new section is spliced in alphabetically by language name.
  """
  comment = None
  notes = []
  lemma = ru_num(num)
  pagetitle = rulib.remove_accents(lemma)
  newtext = generate_page(num)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  # Prepare to create page
  pagemsg("Creating entry")
  page = pywikibot.Page(site, pagetitle)
  # If invalid title, don't do anything.
  existing_text = blib.safe_page_text(page, errandpagemsg, bad_value_ret=None)
  if existing_text is None:
    return
  if not blib.safe_page_exists(page, errandpagemsg):
    # Page doesn't exist. Create it.
    pagemsg("Creating page")
    comment = "Create page for Russian numeral %s (%s)" % (lemma, num)
    page.text = newtext
    if verbose:
      pagemsg("New text is [[%s]]" % page.text)
  else:
    # Page does exist
    pagetext = existing_text
    # Split into sections.  The capturing group keeps the "==Language=="
    # headers in the split output at odd indices.
    splitsections = re.split("(^==[^=\n]+==\n)", pagetext, 0, re.M)
    # Extract off pagehead and recombine section headers with following text
    pagehead = splitsections[0]
    sections = []
    for i in xrange(1, len(splitsections)):
      if (i % 2) == 1:
        sections.append("")
      sections[-1] += splitsections[i]
    # Go through each section in turn, looking for existing Russian section
    for i in xrange(len(sections)):
      m = re.match("^==([^=\n]+)==$", sections[i], re.M)
      if not m:
        pagemsg("Can't find language name in text: [[%s]]" % (sections[i]))
      elif m.group(1) == "Russian":
        # Extract off trailing separator
        mm = re.match(r"^(.*?\n)(\n*--+\n*)$", sections[i], re.S)
        if mm:
          # Note that this changes the number of sections, which is seemingly
          # a problem because the for-loop above calculates the end point
          # at the beginning of the loop, but is not actually a problem
          # because we always break after processing the Russian section.
          sections[i:i + 1] = [mm.group(1), mm.group(2)]
        if params.overwrite_page:
          if "==Etymology 1==" in sections[i] and not params.overwrite_etymologies:
            errandpagemsg("WARNING: Found ==Etymology 1== in page text, not overwriting, skipping form")
            return
          else:
            pagemsg("WARNING: Overwriting entire Russian section")
            comment = "Create Russian section for numeral %s (%s)" % (lemma, num)
            sections[i] = newtext
            notes.append("overwrite section")
            break
        else:
          errandpagemsg("WARNING: Not overwriting existing Russian section")
          return
      elif m.group(1) > "Russian":
        # Language sections are kept in alphabetical order; insert before the
        # first section that sorts after "Russian".
        pagemsg("Exists; inserting before %s section" % (m.group(1)))
        comment = "Create Russian section and entry for numeral %s (%s); insert before %s section" % (
          lemma, num, m.group(1))
        sections[i:i] = [newtext, "\n----\n\n"]
        break
    else:
      # else of for loop over sections, i.e. no break out of loop
      pagemsg("Exists; adding section to end")
      comment = "Create Russian section and entry for numeral %s (%s); append at end" % (
        lemma, num)
      if sections:
        sections[-1] = ensure_two_trailing_nl(sections[-1])
        # NOTE(review): `newsection` is not defined anywhere in this
        # function (only `newtext` is); presumably it should be `newtext`,
        # or is a module-level name -- verify before relying on this path.
        sections += ["----\n\n", newsection]
      else:
        if not params.overwrite_page:
          notes.append("formerly empty")
        if pagehead.lower().startswith("#redirect"):
          pagemsg("WARNING: Page is redirect, overwriting")
          notes.append("overwrite redirect")
          # Convert the redirect into an {{also|...}} hatnote pointing at
          # the old redirect target.
          pagehead = re.sub(r"#redirect *\[\[(.*?)\]\] *(<!--.*?--> *)*\n*",
            r"{{also|\1}}\n", pagehead, 0, re.I)
        elif not params.overwrite_page:
          pagemsg("WARNING: No language sections in current page")
        sections += [newsection]
    # End of loop over sections in existing page; rejoin sections
    newtext = pagehead + ''.join(sections)
    if page.text != newtext:
      assert comment or notes
    # Eliminate sequences of 3 or more newlines, which may come from
    # ensure_two_trailing_nl(). Add comment if none, in case of existing page
    # with extra newlines.
    newnewtext = re.sub(r"\n\n\n+", r"\n\n", newtext)
    if newnewtext != newtext and not comment and not notes:
      notes = ["eliminate sequences of 3 or more newlines"]
    newtext = newnewtext
    if page.text == newtext:
      pagemsg("No change in text")
    elif verbose:
      pagemsg("Replacing <%s> with <%s>" % (page.text, newtext))
    else:
      pagemsg("Text has changed")
    page.text = newtext
  # Executed whether creating new page or modifying existing page.
  # Check for changed text and save if so.
  notestext = '; '.join(notes)
  if notestext:
    if comment:
      comment += " (%s)" % notestext
    else:
      comment = notestext
  if page.text != existing_text:
    if save:
      pagemsg("Saving with comment = %s" % comment)
      blib.safe_page_save(page, comment, errandpagemsg)
    else:
      pagemsg("Would save with comment = %s" % comment)
def process_line(index, line, add_passive_of, override_etym, save, verbose):
  """Parse one input line ("ACCENTED_TERM ETYMOLOGY") and add (or replace)
  an ===Etymology=== section in the Russian entry for the term.

  index: line index used in log messages.
  line: input line; "#"-lines are skipped, a leading "!" forces
    override_etym for that line.
  add_passive_of: if true, also append a {{passive of|...}} definition line
    derived from the term with the reflexive ending stripped.
  override_etym: if true, replace an existing Etymology subsection instead
    of refusing to touch pages that already have one.
  save / verbose: save the page / log full replacement text.

  The etymology field supports several mini-languages: "-" (request
  etymology), "--" (none), "part*/adj*/partadj*" (participle/adjective
  form entries), "acr:", "deverb:", "back:", "raw:", "LANG:..." borrowings
  and "+"-separated affix compounds.
  """
  def error(text):
    # Hard-fail on malformed input so bad lines are noticed immediately.
    errmsg("ERROR: Processing line: %s" % line)
    errmsg("ERROR: %s" % text)
    assert False
  # NOTE(review): check_stress is defined but never called in this
  # function -- possibly vestigial or used via copy/paste lineage; verify.
  def check_stress(word):
    # NOTE(review): the pattern r"|.*" has an empty first alternative and
    # so matches the empty string at position 0; it was presumably meant to
    # be r"\|.*" (strip a translit annotation after "|") -- verify.
    word = re.sub(r"|.*", "", word)
    if word.startswith("-") or word.endswith("-"):
      # Allow unstressed prefix (e.g. разо-) and unstressed suffix (e.g. -овать)
      return
    if rulib.needs_accents(word, split_dash=True):
      error("Word %s missing an accent" % word)
  # Skip lines consisting entirely of comments
  if line.startswith("#"):
    return
  if line.startswith("!"):
    override_etym = True
    line = line[1:]
  # If the second element (the etymology) begins with raw:, allow spaces in the remainder to be
  # included as part of the second element.
  els = do_split(r"\s+", line, 1)
  if len(els) != 2:
    error("Expected two fields, saw %s" % len(els))
  if not els[1].startswith("raw:"):
    els = do_split(r"\s+", line)
  # Replace _ with space and \u
  els = [el.replace("_", " ").replace(r"\u", "_") for el in els]
  if len(els) != 2:
    error("Expected two fields, saw %s" % len(els))
  accented_term = els[0]
  term = rulib.remove_accents(accented_term)
  etym = els[1]
  pagetitle = term
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  # Handle etymology
  adjformtext = ""
  if etym == "?":
    error("Etymology consists of bare question mark")
  elif etym == "-":
    etymtext = "===Etymology===\n{{rfe|lang=ru}}\n\n"
  elif etym == "--":
    etymtext = ""
  elif re.search(r"^(part|adj|partadj)([fnp]):", etym):
    # Form-of entry: generate participle and/or adjective form subsections
    # instead of an etymology.
    m = re.search(r"^(part|adj|partadj)([fnp]):(.*)", etym)
    forms = {"f":["nom|f|s"], "n":["nom|n|s", "acc|n|s"], "p":["nom|p", "in|acc|p"]}
    infleclines = ["# {{inflection of|lang=ru|%s||%s}}" % (m.group(3), form)
      for form in forms[m.group(2)]]
    # NOTE(review): headterm and trtext are not defined in this function;
    # they presumably come from module level (or this is a latent bug on
    # these code paths) -- verify.
    if m.group(1) in ["adj", "partadj"]:
      adjinfltext = """===Adjective===
{{head|ru|adjective form|head=%s%s}}

%s\n\n""" % (headterm, trtext, "\n".join(infleclines))
    else:
      adjinfltext = ""
    if m.group(1) in ["part", "partadj"]:
      partinfltext = """===Participle===
{{head|ru|participle form|head=%s%s}}

%s\n\n""" % (headterm, trtext, "\n".join(infleclines))
    else:
      partinfltext = ""
    adjformtext = partinfltext + adjinfltext
    etymtext = ""
  else:
    if etym.startswith("acr:"):
      _, fullexpr, meaning = do_split(":", etym)
      etymtext = "{{ru-etym acronym of|%s||%s}}." % (fullexpr, meaning)
    elif etym.startswith("deverb:"):
      _, sourceterm = do_split(":", etym)
      etymtext = "Deverbal from {{m|ru|%s}}." % sourceterm
    elif etym.startswith("back:"):
      _, sourceterm = do_split(":", etym)
      etymtext = "{{back-form|lang=ru|%s}}" % sourceterm
    elif etym.startswith("raw:"):
      etymtext = re.sub(", *", ", ", re.sub("^raw:", "", etym))
    elif ":" in etym and "+" not in etym:
      # Simple borrowing from a single language.
      if etym.startswith("?"):
        prefix = "Perhaps borrowed from "
        etym = re.sub(r"^\?", "", etym)
      elif etym.startswith("<<"):
        prefix = "Ultimately borrowed from "
        etym = re.sub(r"^<<", "", etym)
      else:
        prefix = "Borrowed from "
      m = re.search(r"^([a-zA-Z.-]+):(.*)", etym)
      if not m:
        error("Bad etymology form: %s" % etym)
      etymtext = "%s{{bor|ru|%s|%s}}." % (prefix, m.group(1), m.group(2))
    else:
      # Affix compound, possibly with a foreign first element.
      prefix = ""
      suffix = ""
      if etym.startswith("?"):
        prefix = "Perhaps from "
        suffix = "."
        etym = re.sub(r"^\?", "", etym)
      elif etym.startswith("<<"):
        prefix = "Ultimately from "
        suffix = "."
        etym = re.sub(r"^<<", "", etym)
      m = re.search(r"^([a-zA-Z.-]+):(.*)", etym)
      if m:
        langtext = "|lang1=%s" % m.group(1)
        etym = m.group(2)
      else:
        langtext = ""
      etymtext = "%s{{affix|ru|%s%s}}%s" % (prefix,
        "|".join(do_split(r"\+", re.sub(", *", ", ", etym))), langtext, suffix)
    etymbody = etymtext + "\n\n"
    etymtext = "===Etymology===\n" + etymbody
  if not etymtext:
    pagemsg("No etymology text, skipping")
  # Load page
  page = pywikibot.Page(site, pagetitle)
  if not blib.try_repeatedly(lambda: page.exists(), pagemsg, "check page existence"):
    pagemsg("Page doesn't exist, can't add etymology")
    return
  pagemsg("Adding etymology")
  notes = []
  pagetext = unicode(page.text)
  # Split into sections
  splitsections = re.split("(^==[^=\n]+==\n)", pagetext, 0, re.M)
  # Extract off pagehead and recombine section headers with following text
  pagehead = splitsections[0]
  sections = []
  for i in xrange(1, len(splitsections)):
    if (i % 2) == 1:
      sections.append("")
    sections[-1] += splitsections[i]
  # Go through each section in turn, looking for existing Russian section
  for i in xrange(len(sections)):
    m = re.match("^==([^=\n]+)==$", sections[i], re.M)
    if not m:
      pagemsg("Can't find language name in text: [[%s]]" % (sections[i]))
    elif m.group(1) == "Russian":
      if override_etym:
        # Replace the body of an existing Etymology subsection wholesale.
        subsections = re.split("(^===+[^=\n]+===+\n)", sections[i], 0, re.M)
        replaced_etym = False
        for j in xrange(2, len(subsections), 2):
          if "==Etymology==" in subsections[j - 1] or "==Etymology 1==" in subsections[j - 1]:
            subsections[j] = etymbody
            replaced_etym = True
            break
        if replaced_etym:
          sections[i] = "".join(subsections)
          # NOTE(review): unlike the insertion path below, this does not
          # prepend pagehead to newtext; looks like a latent bug -- verify.
          newtext = "".join(sections)
          notes.append("replace Etymology section in Russian lemma with manually specified etymology")
          break
      if "==Etymology==" in sections[i] or "==Etymology 1==" in sections[i]:
        errandpagemsg("WARNING: Already found etymology, skipping")
        return
      subsections = re.split("(^===+[^=\n]+===+\n)", sections[i], 0, re.M)
      # Insert the new Etymology before the first subsection header, but
      # after an ===Alternative forms=== section if one leads.
      insert_before = 1
      if "===Alternative forms===" in subsections[insert_before]:
        insert_before += 2
      subsections[insert_before] = etymtext + subsections[insert_before]
      sections[i] = "".join(subsections)
      if add_passive_of:
        # Derive the active verb by stripping the reflexive ending -ся/-сь.
        active_term = rulib.remove_monosyllabic_accents(
          re.sub(u"с[яь]$", "", accented_term))
        sections[i] = re.sub(r"(^(#.*\n)+)",
          r"\1# {{passive of|lang=ru|%s}}\n" % active_term, sections[i], 1, re.M)
      newtext = pagehead + "".join(sections)
      notes.append("add (manually specified) Etymology section to Russian lemma")
      break
  else:
    errandpagemsg("WARNING: Can't find Russian section, skipping")
    return
  if newtext != pagetext:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (pagetext, newtext))
    assert notes
    comment = "; ".join(group_notes(notes))
    if save:
      blib.safe_page_save(page, comment, errandpagemsg)
    else:
      pagemsg("Would save with comment = %s" % comment)
def create_declension(page, index, save, pos, tempname, decltempname, sgnum,
    removeparams, is_proper=False):
  """Add an Arabic declension table to matching POS subsections of `page`.

  index: page index used in log messages.
  save: if true, actually save; otherwise only log.
  pos: section header to match, e.g. "Noun" or "Adjective".
  tempname: required headword template name (e.g. "ar-noun"); subsections
    whose headword template differs are skipped.
  decltempname: declension template to insert (e.g. "ar-decl-noun").
  sgnum: number code of the singular ("sg" etc.), used for idafa handling.
  removeparams: headword-template params not to copy into the declension
    template.
  is_proper: whether we are handling proper nouns (affects state/idafa
    detection).

  Walks the Arabic language section, and for each subsection whose header
  matches `pos`, normalizes the headword (removing al-, i3rab diacritics,
  links; handling two-word idafa/adjectival constructions and manual
  transliteration), then appends a ===Declension=== subsection built from
  the cleaned-up headword parameters.
  """
  pagename = page.title()
  comments = []
  def pgmsg(text):
    msg("Page %s %s: %s" % (index, pagename, text))
  # Starts with definite article al-
  def starts_with_al(text):
    return re.match(ALIF_ANY + A + "?" + L, text)
  # Like re.sub(), but return "" (falsy) when the pattern doesn't match,
  # so alternatives can be chained with `or`.
  def sub_if(fr, to, text):
    if re.search(fr, text):
      return re.sub(fr, to, text)
    else:
      return ""
  # Remove definite article al- from text
  def remove_al(text):
    return (sub_if("^" + ALIF_ANY + A + "?" + L + SK + "?(.)" + SH, r"\1", text)
      or sub_if("^" + ALIF_ANY + A + "?" + L + SK + "?", "", text)
      or text)
  # Remove definite article al- from transliterated text
  def remove_al_tr(text):
    return (sub_if(ur"^a?([sšṣtṯṭdḏḍzžẓnrḷ])-\1", r"\1", text)
      or sub_if("^a?l-", "", text)
      or text)
  # Split off interwiki links at end
  m = re.match(r"^(.*?\n+)((\[\[[a-z0-9_\-]+:[^\]]+\]\]\n*)*)$",
      page.text, re.S)
  if m:
    pagebody = m.group(1)
    pagetail = m.group(2)
  else:
    pagebody = page.text
    pagetail = ""
  # Split top-level sections (by language)
  splitsections = re.split("(^==[^=\n]+==\n)", pagebody, 0, re.M)
  # Extract off head and recombine section headers with following text
  pagehead = splitsections[0]
  sections = []
  for i in xrange(1, len(splitsections)):
    if (i % 2) == 1:
      sections.append("")
    sections[-1] += splitsections[i]
  # Look for Arabic section
  for seci in xrange(len(sections)):
    m = re.match("^==([^=\n]+)==$", sections[seci], re.M)
    if not m:
      pgmsg("Can't find language name in text: [[%s]]" % (sections[seci]))
    elif m.group(1) == "Arabic":
      # Extract off trailing separator
      mm = re.match(r"^(.*?\n+)(--+\n*)$", sections[seci], re.S)
      if mm:
        secbody = mm.group(1)
        sectail = mm.group(2)
      else:
        secbody = sections[seci]
        sectail = ""
      # Split into subsections based on headers
      subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)
      # Go through each subsection
      for j in xrange(len(subsections)):
        notes = []
        def add_note(note):
          if note not in notes:
            notes.append(note)
        # Look for subsections matching the given POS
        if j > 0 and (j % 2) == 0 and re.match("^===+%s===+\n" % pos, subsections[j - 1]):
          # Call reorder_shadda here so the templates we work with have
          # shadda in correct order but we don't mess with other text to
          # avoid unnecessary saving
          parsed = blib.parse_text(reorder_shadda(subsections[j]))
          def pagemsg(text):
            pgmsg("%s: [[%s]]" % (text, subsections[j]))
          # Check for various conditions causing us to skip this entry and
          # not try to add a declension table
          # Skip declension if certain templates found in definition.
          # We don't check for {{alternative form of|...}}, because it's
          # used for e.g. different ways of spelling "camera" in Arabic,
          # some with -ā and some with -a, so we still want to create
          # declensions for those.
          altspelling_templates = [temp for temp in parsed.filter_templates()
            if temp.name in ["alternative spelling of"]]
          if len(altspelling_templates) > 0:
            pagemsg("Alternative spelling redirect found in text, skipping")
            continue
          if pos == "Adjective":
            feminine_of_templates = [temp for temp in parsed.filter_templates()
              if temp.name in ["feminine of"]]
            if len(feminine_of_templates) > 0:
              pagemsg("feminine-of template found for adjective, skipping")
              continue
          # Retrieve headword_template, make sure exactly one and it is the right type
          headword_templates = [temp for temp in parsed.filter_templates()
            if temp.name in ["ar-noun", "ar-proper noun", "ar-coll-noun",
              "ar-sing-noun", "ar-noun-pl", "ar-noun-dual", "ar-adj-fem",
              "ar-adj-pl", "ar-noun-inf-cons", "ar-adj-inf-def",
              "ar-adj-dual", "ar-adj", "ar-nisba", "ar-noun-nisba",
              "ar-adj-sound", "ar-adj-in", "ar-adj-an"]]
          if len(headword_templates) == 0:
            pagemsg("WARNING: Can't find headword template in text, skipping")
            continue
          if len(headword_templates) > 1:
            pagemsg("WARNING: Found multiple headword templates in text, skipping")
            continue
          headword_template = headword_templates[0]
          if headword_template.name != tempname:
            pagemsg("Headword template should be '%s' but is '%s', skipping" % (
              tempname, headword_template.name))
            continue
          def getp(param):
            return getparam(headword_template, param)
          # NOTE: We physically add and remove parameters from the headword
          # template to get the list of parameters to use in creating the
          # declension template. These changes don't get propagated to the
          # headword template because we don't convert the parsed text back
          # to a string.
          def putp(param, value):
            addparam(headword_template, param, value)
          head = getp("1")
          orighead = head
          # Check for declension already present
          if (j + 1 < len(subsections) and re.match("^===+Declension===+\n", subsections[j + 1])
              or j + 3 < len(subsections) and re.match("^===+Usage", subsections[j + 1])
              and re.match("^===+Declension===+\n", subsections[j + 3])):
            pagemsg("Declension already found for head %s, skipping" % head)
            continue
          # Check for cpl
          # FIXME: Convert cpl into pl and fpl
          if getp("cpl"):
            pagemsg("WARNING: Headword template for head %s has cpl param in it, skipping" % (head))
            continue
          # Check for empty head. If w/o explicit translit, skip; else,
          # fetch head from page title.
          if not head:
            if not getp("tr"):
              pagemsg("WARNING: Headword template head is empty and without explicit translit, skipping")
              continue
            else:
              pagemsg("Headword template head is empty but has explicit translit")
              add_note("empty head, using page name")
            head = pagename
            putp("1", head)
          # Try to handle cases with a modifier; we can't handle all of them yet
          headspace = False
          if ' ' in head:
            headspace = True
            words = re.split(r"\s", remove_links(head))
            head = words[0]
            if len(words) > 2:
              pagemsg("WARNING: Headword template head %s has two or more spaces in it, skipping" % orighead)
              continue
            assert (len(words) == 2)
            # Check for params we don't yet know how to handle
            must_continue = False
            for badparam in ["pl2", "pltr", "head2", "sing", "coll"]:
              if getp(badparam):
                # FIXME
                pagemsg("WARNING: Headword template head %s has space in it and param %s, skipping" % (
                  orighead, badparam))
                must_continue = True
                break
            if must_continue:
              continue
            # Now check for various types of construction, all either
            # construct (ʾidāfa) or adjectival
            def remove_nom_gen_i3rab(word, nomgen, undia, undiatext, udia, udiatext):
              # Strip a final nominative/genitive i3rab diacritic (nunated or
              # plain), and normalize a leading alif wasla to plain alif.
              if word.endswith(undia):
                pagemsg("Removing %s i3rab (%s) from %s" % (nomgen, undiatext, word))
                add_note("removing %s i3rab (%s)" % (nomgen, undiatext))
                return re.sub(undia + "$", "", word)
              if word.endswith(udia):
                pagemsg("Removing %s i3rab (%s) from %s" % (nomgen, udiatext, word))
                add_note("removing %s i3rab (%s)" % (nomgen, udiatext))
                return re.sub(udia + "$", "", word)
              if re.search(DIACRITIC_ANY_BUT_SH + "$", word):
                pagemsg("WARNING: Strange diacritic at end of %s %s" % (nomgen, word))
              if word[0] == ALIF_WASLA:
                pagemsg("Changing %s alif wasla to plain alif for %s" % (nomgen, word))
                add_note("changing %s alif wasla to plain alif" % (nomgen))
                word = ALIF + word[1:]
              return word
            def remove_gen_i3rab(word):
              return remove_nom_gen_i3rab(word, "genitive", IN, "IN", I, "I")
            def remove_nom_i3rab(word):
              return remove_nom_gen_i3rab(word, "nominative", UN, "UN", U, "U")
            def remove_gen_i3rab_tr(word):
              return remove_nom_gen_i3rab(word, "genitive", "in", "in", "i", "i")
            def remove_nom_i3rab_tr(word):
              return remove_nom_gen_i3rab(word, "nominative", "un", "un", "u", "u")
            idafa = False
            word0al = starts_with_al(words[0])
            word1al = starts_with_al(words[1])
            words[0] = remove_al(words[0])
            words[1] = remove_al(words[1])
            putp("1", words[0])
            putp("mod", words[1])
            if word0al and word1al:
              pagemsg("Headword template head %s has space in it and found definite adjective construction" % (orighead))
              add_note("modifier definite adjective construction")
              putp("state", "def")
            elif word0al and not word1al:
              pagemsg("WARNING: Headword template head %s has space in it and found al-X + Y construction, can't handle, skipping" % (orighead))
              continue
            elif is_proper:
              if words[0].endswith(ALIF) and word1al:
                pagemsg("Proper noun headword template head %s has space in it and found ind-def with definite adjectival modifier" % (orighead))
                add_note("modifier proper noun + definite adjective construction")
                putp("state", "ind-def")
              elif remove_diacritics(words[0]) == u"جمهورية":
                if word1al:
                  pagemsg("Proper noun headword template head %s has space in it and found definite idafa" % (orighead))
                  add_note("modifier definite idafa construction")
                  idafa = True
                  assert sgnum == "sg"
                  idafaval = "def"
                  putp("idafa", idafaval)
                elif words[1].endswith(ALIF):
                  pagemsg("Proper noun headword template head %s has space in it and found idafa with ind-def modifier" % (orighead))
                  add_note("modifier proper-noun ind-def idafa construction")
                  assert sgnum == "sg"
                  idafaval = "ind-def"
                  putp("idafa", idafaval)
                else:
                  pagemsg("WARNING: Proper noun headword template head %s has space in it and found idafa construction we can't handle, skipping" % (orighead))
                  continue
              else:
                pagemsg("WARNING: Proper noun headword template head %s has space in it and can't determine whether idafa, skipping" % (orighead))
                continue
            elif not word0al and word1al:
              # Found an ʾidāfa construction
              pagemsg("Headword template head %s has space in it and found definite idafa" % (orighead))
              add_note("modifier definite idafa construction")
              idafa = True
              idafaval = "def-" + sgnum
              if idafaval == "def-sg":
                idafaval = "def"
              putp("idafa", idafaval)
            elif words[1].endswith(I + Y):
              pagemsg("WARNING: Headword template head %s has space in it and appears to end in badly formatted nisba, FIXME, skipping" % (orighead))
              continue
            elif words[1].endswith(I + Y + SH):
              pagemsg("Headword template head %s has space in it and found indefinite adjective nisba construction" % (orighead))
              add_note("modifier indefinite nisba adjective construction")
            elif pagename in adjectival_phrases:
              pagemsg("Headword template head %s has space in it, indefinite, and manually specified to be adjectival" % (orighead))
              add_note("modifier indefinite adjective construction")
            else:
              pagemsg("Headword template head %s has space in it, indefinite, and not specified to be adjectival, assuming idafa" % (orighead))
              add_note("modifier indefinite idafa construction")
              idafa = True
              putp("idafa", sgnum)
            # Now remove any i3rab diacritics
            putp("1", remove_nom_i3rab(getp("1")))
            if idafa:
              putp("mod", remove_gen_i3rab(getp("mod")))
            else:
              putp("mod", remove_nom_i3rab(getp("mod")))
            # Now check if the lemma is plural
            if re.match(r"\bp\b", getp("2")):
              pagemsg("Headword template head %s has space in it and is plural" % (orighead))
              add_note("plural lemma")
              if getp("tr"):
                # FIXME (doesn't occur though)
                pagemsg("WARNING: Headword template head %s has space in it and manual translit and is plural, skipping" % (orighead))
                continue
              putp("pl", getp("1"))
              putp("1", "-")
              if not idafa:
                putp("modpl", getp("mod"))
                putp("mod", "-")
            # Now check if lemma has plural specified
            elif getp("pl"):
              pls = re.split(r"\s", remove_links(getp("pl")))
              assert (len(pls) == 2)
              pls[0] = remove_al(pls[0])
              pls[1] = remove_al(pls[1])
              putp("pl", remove_nom_i3rab(pls[0]))
              if not idafa:
                putp("modpl", remove_nom_i3rab(pls[1]))
              else:
                if pls[1] != getp("mod"):
                  pagemsg("FIXME: Headword template head %s, plural modifier %s not same as singular modifier %s in idafa construction" % (
                    orighead, pls[1], getp("mod")))
            # Now check if there's manual translit. We need to split the
            # manual translit in two and pair up manual translit with
            # corresponding Arabic words. But first remove -t indicating
            # construct state, and check to see if manual translit is
            # same as auto translit, in which case it's unnecessary.
            if getp("tr"):
              pagemsg("Headword template head %s has space in it and manual translit" % (orighead))
              trwords = re.split(r"\s", getp("tr"))
              assert (len(trwords) == 2)
              trwords[0] = remove_nom_i3rab_tr(remove_al_tr(trwords[0]))
              if idafa:
                trwords[1] = remove_gen_i3rab_tr(remove_al_tr(trwords[1]))
              else:
                trwords[1] = remove_nom_i3rab_tr(remove_al_tr(trwords[1]))
              # Remove any extraneous -t from translit, either from construct
              # state of from removal of i3rab in a feminine noun/adj.
              for i in [0, 1]:
                if words[i].endswith(TAM) and trwords[i].endswith("t"):
                  trwords[i] = trwords[i][0:-1]
                if words[i].endswith(ALIF + TAM) and not trwords[i].endswith("h"):
                  trwords[i] += "h"
              if ar_translit.tr(words[0]) != trwords[0]:
                pagemsg("Headword template head %s has space in it and manual translit %s which is different from auto-translit of %s" % (
                  orighead, trwords[0], words[0]))
                add_note("modified head w/manual translit")
                putp("1", "%s/%s" % (getp("1"), trwords[0]))
              else:
                pagemsg("Headword template head %s has space in it and manual translit %s which is same as auto-translit of %s" % (
                  orighead, trwords[0], words[0]))
                add_note("modified head w/ignored manual translit")
              if ar_translit.tr(words[1]) != trwords[1]:
                pagemsg("Headword template head %s has space in it and manual translit %s which is different from auto-translit of %s" % (
                  orighead, trwords[1], words[1]))
                add_note("modifier w/manual translit")
                putp("mod", "%s/%s" % (getp("mod"), trwords[1]))
              else:
                pagemsg("Headword template head %s has space in it and manual translit %s which is same as auto-translit of %s" % (
                  orighead, trwords[1], words[1]))
                add_note("modifier w/ignored manual translit")
          else:
            # no space in head, not dealing with a modifier
            # If has link in it, just remove it
            if '[' in head or ']' in head or '|' in head:
              pagemsg("Headword template head %s has link in it" % (head))
              add_note("removed links from head")
              head = remove_links(head)
              putp("1", head)
            # If starts with definite article, remove article from everything,
            # including transliterations, and set state=def
            if starts_with_al(head):
              pagemsg("Headword template head %s starts with definite article" % (head))
              add_note("definite lemma")
              head = remove_al(head)
              putp("1", head)
              putp("state", "def")
              # Also remove al- from remaining head and pl params
              def check_for_al(param):
                # NOTE(review): remove_links() is applied to the param NAME
                # here, not its value -- looks suspicious; verify intent.
                param = remove_links(param)
                value = getparam(headword_template, param)
                if value:
                  if '[' in value or ']' in value or '|' in value:
                    pagemsg("Param %s value %s has link in it" % (param, value))
                    add_note("removed links from %s" % param)
                    value = remove_links(value)
                  putp(param, remove_al(value))
              params_to_check = ["pl", "sing", "coll", "pauc", "f", "fpl"]
              for param in params_to_check:
                check_for_al(param)
              for i in xrange(2, 10):
                check_for_al("head%s" % i)
                for param in params_to_check:
                  check_for_al("%s%s" % (param, i))
              # Also remove al- from transliteration
              # NOTE(review): check_for_al_tr is defined but never called;
              # the calls below use check_for_al instead -- presumably they
              # were meant to use check_for_al_tr; verify.
              def check_for_al_tr(param):
                value = getparam(headword_template, param)
                if value:
                  putp(param, remove_al_tr(value))
              check_for_al("tr")
              for param in params_to_check:
                check_for_al("%str" % param)
              for i in xrange(2, 10):
                check_for_al("tr%s" % i)
                for param in params_to_check:
                  check_for_al("%s%str" % (param, i))
            elif is_proper:
              if head.endswith(ALIF):
                pagemsg(u"Headword template head %s ends in -ā" % (head))
                putp("state", "ind-def")
              else:
                pagemsg(u"WARNING: Headword template head %s is indefinite proper noun, not ending in -ā, skipping" % (head))
                continue
            if head.endswith(UN):
              pagemsg("Headword template head %s ends with explicit i3rab (UN)" % (head))
              add_note("head has explicit i3rab (UN)")
              # We don't continue here because we handle this case below
            elif head.endswith(U):
              pagemsg("Headword template head %s ends with explicit i3rab (U)" % (head))
              add_note("head has explicit i3rab (U)")
              # We don't continue here because we don't need to handle this case
            # Now check if the lemma is plural
            if re.match(r"\bp\b", getp("2")):
              pagemsg("Headword template head %s is plural" % (head))
              add_note("plural lemma")
              if getp("tr"):
                # FIXME (doesn't occur though)
                pagemsg("WARNING: Headword template head %s has manual translit and is plural, skipping" % (head))
                continue
              putp("pl", getp("1"))
              putp("1", "-")
          # Now fetch the parameters from the headword template, removing
          # any that we want to remove, removing the i3rab -UN ending, and
          # adding any specified manual translit as a / annotation.
          def param_should_be_removed(param):
            name = unicode(param.name)
            if name == "sc" and unicode(param.value) == "Arab":
              return True
            if name.endswith("tr"):
              return True
            for remove in removeparams:
              if name == remove:
                return True
              if re.match("^[a-z]+$", remove) and re.match("^%s([0-9]+)?$" % remove, name):
                return True
            return False
          def remove_i3rab(param):
            text = unicode(param)
            if text.endswith(UN):
              pgmsg("Removing i3rab from %s: %s" % (text, unicode(headword_template)))
              add_note("removing i3rab")
            return re.sub(UN + "$", "", text)
          # Map a headword param name to its corresponding translit param.
          def trparam(name):
            if name == "1":
              return "tr"
            elif name.startswith("head"):
              return name.replace("head", "tr")
            else:
              return name + "tr"
          def process_param(param):
            arabic = remove_i3rab(param)
            # Value of + is used in ar-nisba, ar-noun-nisba, ar-adj-in
            # to signal the strong plural.
            if arabic.endswith("=+"):
              newarabic = re.sub(r"=\+$", "=sp", arabic)
              pgmsg("Converting %s to %s: %s" % (arabic, newarabic, unicode(headword_template)))
              arabic = newarabic
            # Value of - is used in ar-adj-in to signal an unknown
            # feminine plural.
            if arabic.endswith("=-"):
              newarabic = re.sub(r"=-$", "=?", arabic)
              pgmsg("Converting %s to %s: %s" % (arabic, newarabic, unicode(headword_template)))
              arabic = newarabic
            # Don't process translit in modifier constructions, where the
            # translit is also processed.
            if not headspace:
              tr = getparam(headword_template, trparam(unicode(param.name)))
              if tr:
                return arabic + "/" + tr
            return arabic
          params = '|'.join([process_param(param) for param in headword_template.params
            if not param_should_be_removed(param)])
          # For templates that automatically supply the masculine plural,
          # supply it here, too if not overridden.
          if tempname in ["ar-nisba", "ar-noun-nisba", "ar-adj-sound",
              "ar-adj-an"] and not getp("pl"):
            params += '|pl=sp'
          # Separate off any [[Category: Foo]] declarators, insert before them
          m = re.match(r"^(.*?\n+)((\[\[[A-Za-z0-9_\-]+:[^\]]+\]\]\n*)*)$",
              subsections[j], re.S)
          if m:
            body = m.group(1)
            tail = m.group(2)
          else:
            body = subsections[j]
            tail = ""
          # Make sure there are two trailing newlines
          if body.endswith("\n\n"):
            pass
          elif body.endswith("\n"):
            body += "\n"
          else:
            body += "\n\n"
          # Reuse the POS header with one more "=" level for the new
          # Declension header, then append the declension template call.
          body += (subsections[j - 1].replace(pos, "=Declension=") +
            "{{%s|%s}}\n\n" % (decltempname, params))
          subsections[j] = body + tail
          comment = "added declension for %s %s" % (tempname,
            remove_links(orighead) or "%s/%s" % (pagename, getp("tr")))
          note = ', '.join(notes)
          if note:
            comment = "%s (%s)" % (comment, note)
          comments.append(comment)
      sections[seci] = ''.join(subsections) + sectail
  newtext = pagehead + ''.join(sections) + pagetail
  comment = '; '.join(comments)
  # We must have accumulated a comment iff we changed the text.
  assert ((not comment) == (newtext == page.text))
  if newtext != page.text:
    if verbose:
      msg("Replacing [[%s]] with [[%s]]" % (page.text, newtext))
    page.text = newtext
    msg("For page %s, comment = %s" % (pagename, comment))
    if save:
      blib.safe_page_save(page, comment, errandmsg)