 def check_for_al(param):
     param = remove_links(param)
     value = getparam(headword_template, param)
     if value:
         if '[' in value or ']' in value or '|' in value:
             pagemsg(
                 "Param %s value %s has link in it"
                 % (param, value))
             add_note("removed links from %s" %
                      param)
             value = remove_links(value)
         putp(param, remove_al(value))
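A minimal usage sketch (hypothetical parameter name; headword_template, putp and remove_al are assumed to be defined in the enclosing scope):

    #   If the "head" parameter holds "[[foo]]", the stray link is logged and
    #   stripped, and the value is stored back with remove_al applied:
    #   check_for_al("head")   # effectively putp("head", remove_al("foo"))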
Example #3
 def canonicalize_existing(forms):
     forms = [re.sub(" '*or'* ", ",", form) for form in forms]
     forms = [
         splitform for form in forms
         for splitform in form.split(",")
     ]
     return [blib.remove_links(form) for form in forms if form]
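A minimal sketch of the expected behaviour, assuming blib.remove_links turns [[foo]] into foo:

    #   canonicalize_existing(["[[foo]] or [[bar]]", "baz,qux"])
    #   => ["foo", "bar", "baz", "qux"]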
Example #4
def process_section(index, pagetitle, sectext):
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    parsed = blib.parse_text(sectext)
    head = None
    for t in parsed.filter_templates():
        newhead = get_head_param(t, pagetitle)
        if newhead is not None:
            newhead = [blib.remove_links(x) for x in newhead]
            if head and head != newhead:
                pagemsg("WARNING: Saw multiple heads %s and %s" %
                        (",".join(head), ",".join(newhead)))
            head = newhead
    if not head:
        pagemsg("WARNING: Couldn't find head")
    saw_pronun = False
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "IPA":
            if getparam(t, "1") != "ang":
                pagemsg("WARNING: Wrong-language IPA template: %s" %
                        unicode(t))
                continue
            pagemsg("<from> %s <to> {{ang-IPA|%s}} <end>" %
                    (unicode(t), "|".join(head) or "<<%s>>" % pagetitle))
            saw_pronun = True
        elif tn == "ang-IPA":
            pagemsg("Saw existing pronunciation: %s" % unicode(t))
            saw_pronun = True
    if not saw_pronun:
        pagemsg(
            "WARNING: Didn't see pronunciation for headword %s <new> {{ang-IPA|%s}} <end>"
            % (",".join(head), "|".join(head)))
Example #5
 def add(val, tr, is_lemma):
     val_to_add = blib.remove_links(val)
     # Remove monosyllabic accents to correctly handle the case of
     # рад, which has some heads with an accent and some without.
     val_to_add, tr = remove_monosyllabic_accents(
         val_to_add, tr)
     this_heads.add((val_to_add, tr, is_lemma))
def get_headword_pronuns(parsed, pagetitle, pagemsg, expand_text):
  # Get the headword pronunciation(s)
  headword_pronuns = []

  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "la-letter" or tn == "head" and getparam(t, "1") == "la" and getparam(t, "2") == "letter":
      pagemsg("WARNING: Skipping page with letter headword")
      return None
    if lalib.la_template_is_head(t):
      headword_pronuns.extend(lalib.la_get_headword_from_template(t, pagetitle, pagemsg, expand_text))

  # Canonicalize by removing links and final !, ?
  headword_pronuns = [re.sub("[!?]$", "", blib.remove_links(x)) for x in headword_pronuns]

  #for pronun in headword_pronuns:
  #  if lalib.remove_macrons(pronun) != pagetitle:
  #    pagemsg("WARNING: Headword pronun %s doesn't match page title, skipping" % pronun)
  #    return None

  # Check for acronym/non-syllabic.
  for pronun in headword_pronuns:
    if lalib.is_nonsyllabic(pronun):
      pagemsg("WARNING: Pronunciation is non-syllabic, skipping: %s" % pronun)
      return None
    if re.search("[" + lalib.uppercase + "][" + lalib.combining_accent_str + "]?[" + lalib.uppercase + "]", pronun):
      pagemsg("WARNING: Pronunciation may be an acronym, please check: %s" % pronun)

  headword_pronuns = remove_list_duplicates(headword_pronuns)
  if len(headword_pronuns) < 1:
    pagemsg("WARNING: Can't find headword template")
    return None
  return headword_pronuns
def clean(value):
    value = value.strip()
    value = remove_links(value)
    value = re.sub(", +", ",", value)
    if value == "-":
        value = ""
    return value
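Illustrative calls (hypothetical inputs; remove_links is assumed to strip [[...]] wiki links):

    #   clean(" [[foo]], bar ")  => "foo,bar"
    #   clean("-")               => ""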
Example #8
def process_text_on_page_for_full_conj(index, pagename, text, verbs):
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagename, txt))

  pagemsg("Processing")

  notes = []

  if pagename not in verbs:
    pagemsg("WARNING: Can't find entry, skipping")
    return

  entry = verbs[pagename]
  origentry = entry
  first, rest = pagename.split(" ", 1)
  restwords = rest.split(" ")
  def_link = "%s<> %s" % (first, " ".join("[[%s]]" % word for word in restwords))
  if def_link == entry:
    pagemsg("Replacing entry '%s' with a blank entry because it's the default" % entry)
    entry = ""
  elif re.sub("<.*?>", "<>", entry) == def_link:
    newentry = blib.remove_links(entry)
    pagemsg("Replacing entry '%s' with entry without links '%s'" % (entry, newentry))
    entry = newentry

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn == "es-verb":
      if not getparam(t, "attn"):
        pagemsg("Didn't see attn=1: %s" % unicode(t))
        continue
      rmparam(t, "attn")
      if entry:
        t.add("1", entry)
        notes.append("add conjugation '%s' to Spanish verb" % entry)
      else:
        notes.append("add conjugation (default) to Spanish verb")
    if tn == "head" and getparam(t, "1") == "es" and getparam(t, "2") == "verb":
      head = getparam(t, "head")
      if head:
        pagemsg("WARNING: Removing head=%s compared with entry '%s', original entry '%s': %s" %
            (head, entry, origentry, unicode(t)))
        rmparam(t, "head")
      rmparam(t, "2")
      rmparam(t, "1")
      blib.set_template_name(t, "es-verb")
      if entry:
        t.add("1", entry)
        notes.append("convert {{head|es|verb}} to {{es-verb|%s}}" % entry)
      else:
        notes.append("convert {{head|es|verb}} to {{es-verb}}")
    if origt != unicode(t):
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))

  return unicode(parsed), notes
Example #9
def process_text_on_page(index, pagetitle, text):
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if not args.stdin:
        pagemsg("Processing")

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None
    sections, j, secbody, sectail, has_non_latin = retval
    parsed = blib.parse_text(secbody)
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn in lalib.la_headword_templates:
            for head in lalib.la_get_headword_from_template(
                    t, pagetitle, pagemsg):
                no_macrons_head = remove_macrons(blib.remove_links(head))
                if pagetitle.startswith("Reconstruction"):
                    unprefixed_title = "*" + re.sub(".*/", "", pagetitle)
                else:
                    unprefixed_title = pagetitle
                if no_macrons_head != unprefixed_title:
                    pagemsg("WARNING: Bad Latin head: %s" % unicode(t))
    return None, None
Example #10
def clean(value):
  value = value.strip()
  value = remove_links(value)
  value = re.sub(", +", ",", value)
  if value == "-":
    value = ""
  return value
def check_need_accent(text):
  for word in re.split(" +", text):
    word = blib.remove_links(word)
    if u"\u0301" in word or u"ё" in word:
      continue
    if not ru.is_monosyllabic(word):
      return True
  return False
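Illustrative behaviour (assumes ru.is_monosyllabic counts vowels in the bare word):

    #   check_need_accent(u"вода́ на столе́")  => False  (each word is accented or monosyllabic)
    #   check_need_accent(u"вода на столе́")  => True   ("вода" is multisyllabic and unaccented)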
Example #12
def process_page(page, index):
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    if " " in pagetitle:
        pagemsg("WARNING: Space in page title, skipping")
        return
    pagemsg("Processing")

    text = unicode(page.text)

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return

    sections, j, secbody, sectail, has_non_latin = retval

    subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)

    for k in xrange(2, len(subsections), 2):
        parsed = blib.parse_text(subsections[k])
        for t in parsed.filter_templates():
            origt = unicode(t)
            tn = tname(t)
            if tn == "la-adv":
                adv = blib.remove_links(getparam(t, "1")) or pagetitle
                macron_stem, is_stem = lalib.infer_adv_stem(adv)
                if not is_stem:
                    pagemsg(
                        "WARNING: Couldn't infer stem from adverb %s, not standard: %s"
                        % (adv, origt))
                    continue
                adv_defns = lalib.find_defns(subsections[k])
                possible_adjs = []
                stem = lalib.remove_macrons(macron_stem)
                possible_adjs.append(stem + "us")
                possible_adjs.append(stem + "is")
                if stem.endswith("nt"):
                    possible_adjs.append(stem[:-2] + "ns")
                if stem.endswith("plic"):
                    possible_adjs.append(stem[:-2] + "ex")
                if stem.endswith("c"):
                    possible_adjs.append(stem[:-1] + "x")
                if re.search("[aeiou]r$", stem):
                    possible_adjs.append(stem)
                elif stem.endswith("r"):
                    possible_adjs.append(stem[:-1] + "er")
                if adv.endswith(u"iē"):
                    possible_adjs.append(stem + "ius")
                for possible_adj in possible_adjs:
                    investigate_possible_adj(index, possible_adj, adv,
                                             adv_defns)
 def fetch(param):
   val = getparam(t, param).strip()
   val = blib.remove_links(val)
   vals = re.split(r",\s*", val)
   retval = []
   for v in vals:
      # Remove final footnote symbols as per [[Module:table tools]]
     v = re.sub(ur"[*~@#$%^&+0-9_\u00A1-\u00BF\u00D7\u00F7\u2010-\u2027\u2030-\u205E\u2070-\u20CF\u2100-\u2B5F\u2E00-\u2E3F]*$", "", v)
     retval.append(uk.add_monosyllabic_stress(v))
   return ", ".join(retval)
Example #14
def process_text_on_page(index, pagetitle, text):
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    global args

    notes = []

    pagemsg("Processing")

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn == "vi-hantu":
            if not one_char(pagetitle):
                pagemsg("WARNING: Length of page title is %s > 1, skipping" %
                        len(pagetitle))
                continue
            if getparam(t, "pos"):
                pagemsg("WARNING: Saw pos=, skipping: %s" % unicode(t))
                continue
            chu = getparam(t, "chu")
            if chu and chu != "Nom":
                pagemsg("WARNING: Saw chu=%s not 'Nom', skipping: %s" %
                        (chu, unicode(t)))
                continue
            if chu == "Nom":
                newparam = "nom"
            else:
                newparam = "reading"
            reading = blib.remove_links(getparam(t, "1"))
            if not reading:
                pagemsg("WARNING: Empty reading, skipping: %s" % unicode(t))
                continue
            must_continue = False
            for param in t.params:
                pn = pname(param)
                if pn not in ["1", "rs", "chu"]:
                    pagemsg(
                        "WARNING: Unrecognized parameter %s=%s, skipping: %s" %
                        (pn, unicode(param.value), unicode(t)))
                    must_continue = True
                    break
            if must_continue:
                continue
            t.add(newparam, reading, before="1")
            rmparam(t, "1")
            blib.set_template_name(t, "vi-readings")
            notes.append("{{vi-hantu}} -> {{vi-readings}}")

        if unicode(t) != origt:
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

    return unicode(parsed), notes
Example #15
def get_lemmas_of_form_page(parsed):
  lemmas = set()
  for t in parsed.filter_templates():
    tname = unicode(t.name)
    first_param = None
    if (tname in ["inflection of", "comparative of", "superlative of"]):
      first_param = get_first_param(t)
    if first_param:
      lemma = lalib.remove_macrons(blib.remove_links(getparam(t, first_param)))
      lemmas.add(lemma)
  return lemmas
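A hypothetical illustration (get_first_param is defined elsewhere and is assumed to return the name of the parameter holding the lemma):

    #   A form page containing {{inflection of|amō||3|s|pres|act|ind|lang=la}} would
    #   contribute the macron-less lemma "amo" to the returned set.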
def process_text_on_page(index, pagetitle, text):
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    origtext = text
    parsed = blib.parse_text(text)
    head = None
    for t in parsed.filter_templates():
        tn = tname(t)
        newhead = None
        if tn == "head" and getparam(t, "1") == "ang" or tn in [
                "ang-noun", "ang-noun-form", "ang-verb", "ang-verb-form",
                "ang-adj", "ang-adj-form", "ang-adv", "ang-con", "ang-prep",
                "ang-prefix", "ang-proper noun", "ang-suffix"
        ]:
            newhead = getparam(t, "head") or pagetitle
        if newhead:
            if head:
                pagemsg("WARNING: Saw head=%s and newhead=%s, skipping" %
                        (head, newhead))
                return
            head = newhead
    if u"ƿ" not in head:
        pagemsg("WARNING: Something wrong, didn't see wynn in head: %s" % head)
    saw_altspell = None
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "alternative spelling of":
            if saw_altspell:
                pagemsg(
                    "WARNING: Saw multiple {{alternative spelling of}}, skipping: %s and %s"
                    % (unicode(saw_altspell), unicode(t)))
                return
            saw_altspell = unicode(t)
            if getparam(t, "1") != "ang":
                pagemsg(
                    "WARNING: {{alternative spelling of}} without language 'ang', skipping: %s"
                    % unicode(t))
                return
            param2 = getparam(t, "2")
            should_param2 = blib.remove_links(head).replace(u"ƿ", "w")
            if param2 != should_param2:
                origt = unicode(t)
                t.add("2", should_param2)
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
                notes.append(
                    "fix 2= in {{alternative spelling of}} in wynn Old English entries"
                )
    text = re.sub("\n\n+", "\n\n", unicode(parsed))
    if origtext != text and not notes:
        notes.append("condense 3+ newlines to 2")
    return text, notes
Example #17
def hi_adj_is_indeclinable(t, pagetitle):
    if tname(t) == "hi-adj":
        pagename = blib.remove_links(getparam(t, "head") or pagetitle)
        # If the lemma doesn't end with any of the declinable suffixes, it's
        # definitely indeclinable. Some indeclinable adjectives end with these
        # same suffixes, but we have no way to know that these are indeclinable,
        # so assume declinable.
        return not (pagename.endswith(AA) or pagename.endswith(IND_AA) or
                    pagename.endswith(AA + M) or pagename.endswith(IND_AA + M)
                    or pagename.endswith(AA + N)
                    or pagename.endswith(IND_AA + N))
    return False
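Illustrative calls for a {{hi-adj}} template t without head=, under the assumption that AA is the Devanagari ā vowel sign (U+093E) and M/N are nasalization marks defined elsewhere:

    #   hi_adj_is_indeclinable(t, u"बड़ा")   => False  (ends in -ā, assumed declinable)
    #   hi_adj_is_indeclinable(t, u"ख़राब")  => True   (no declinable suffix)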
Example #18
def tr(text, lang=None, sc=None, msgfun=msg):
    text = remove_links(text)
    text = tr_canonicalize_greek(text)

    text = rsub(text, u"γ([γκξχ])", r"n\1")
    text = rsub(text, u"ρρ", "rrh")

    text = rsub(text, '.', tt)

    # compose accented characters, fix hA and similar
    text = tr_canonicalize_latin(text)

    return text
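An illustrative call (the actual output depends on the tt mapping table defined elsewhere):

    #   tr(u"ἄγγελος")  # γγ is first rewritten as nγ, so the transliteration
    #                   # contains "ng" rather than "gg"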
 def check_lemma(lemma):
     lemma = blib.remove_links(lemma)
     if lemma in northern_kurdish_lemmas:
         return "kmr", "Northern Kurdish", "existence of Northern Kurdish lemma"
     elif lemma in central_kurdish_lemmas:
         return "ckb", "Central Kurdish", "existence of Central Kurdish lemma"
     elif lemma in known_northern_kurdish_terms:
         return "kmr", "Northern Kurdish", "Kurdish Wiktionary"
     elif lemma in known_central_kurdish_terms:
         return "ckb", "Central Kurdish", "Kurdish Wiktionary"
     elif re.search("^[%s]" % arabic_charset, lemma):
         return "ckb", "Central Kurdish", "Arabic charset"
     else:
         return "kmr", "Northern Kurdish", "Latin charset"
Example #21
def hi_lemma_is_indeclinable(t, pagetitle, pagemsg):
    if tname(t) in ["hi-noun", "hi-proper noun"]:
        return not not getparam(t, "ind")
    if tname(t) == "hi-adj":
        if getparam(t, "ind"):
            return True
        pagename = blib.remove_links(getparam(t, "head") or pagetitle)
        # If the lemma doesn't end with any of the declinable suffixes, it's
        # definitely indeclinable. Some indeclinable adjectives end with these
        # same suffixes, but we have no way to know that these are indeclinable,
        # so assume declinable.
        return not (pagename.endswith(AA) or pagename.endswith(IND_AA)
                    or pagename.endswith(AA + M))
    return False
def process_page(page, index, parsed):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  notes = []
  adjval = None
  numval = None
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-adj":
      adjval = blib.remove_links(getparam(t, "1"))
    if (unicode(t.name) == "head" and getparam(t, "1") == "ru" and
        getparam(t, "2") == "numeral"):
      numval = blib.remove_links(getparam(t, "head"))
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) == "ordinalbox" and getparam(t, "1") == "ru":
      if not adjval:
        pagemsg("WARNING: Can't find accented ordinal form")
      elif adjval != pagetitle:
        t.add("alt", adjval)
        notes.append("Add alt=%s to ordinalbox" % adjval)
    if unicode(t.name) == "cardinalbox" and getparam(t, "1") == "ru":
      if not numval:
        pagemsg("WARNING: Can't find accented cardinal form")
      elif numval != pagetitle:
        t.add("alt", numval)
        notes.append("Add alt=%s to cardinalbox" % numval)
      if "[[Category:Russian cardinal numbers]]" not in unicode(parsed):
        pagemsg("WARNING: Numeral not in [[Category:Russian cardinal numbers]]")
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  return parsed, notes
Example #23
def replace_decl(page, index, parsed, decl, declforms):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing decl %s" % decl)
  notes = []
  for t in parsed.filter_templates():
    tn = tname(t)
    forms = {}

    if tn == args.lang + "-decl-noun":
      number = ""
      getslots = uk_decl_noun_slots if args.lang == "uk" else be_decl_noun_slots
    elif tn == args.lang + "-decl-noun-unc":
      number = "sg"
      getslots = uk_decl_noun_unc_slots if args.lang == "uk" else be_decl_noun_unc_slots
    elif tn == args.lang + "-decl-noun-pl":
      number = "pl"
      getslots = uk_decl_noun_pl_slots if args.lang == "uk" else be_decl_noun_pl_slots
    else:
      continue

    i = 1
    for slot in getslots:
      if slot:
        form = getparam(t, i).strip()
        if not form:
          continue
        form = blib.remove_links(form)
        # eliminate spaces around commas
        form = re.sub(r"\s*,\s*", ",", form)
        slotforms = form.split(",")
        slotforms = [
            (uk.add_monosyllabic_stress(f) if args.lang == "uk" else be.add_monosyllabic_accent(f))
            for f in slotforms
          ]
        forms[slot] = ",".join(slotforms)
      i += 1

    if compare_forms(forms, declforms, pagemsg):
      origt = unicode(t)
      t.name = args.lang + "-ndecl"
      del t.params[:]
      t.add("1", decl)
      newt = unicode(t)
      pagemsg("Replaced %s with %s" % (origt, newt))
      notes.append("replace {{%s|...}} with %s" % (tn, newt))

  return unicode(parsed), notes
def get_lemmas(line):
    line_els = do_split(r"\s+", line)
    if args.pos:
        lemmas = line_els[0]
    else:
        if len(line_els) < 2:
            fatal(line, "Not enough elements in line")
        lemmas = line_els[1]
    starts_with_exclamation_point = False
    if lemmas.startswith("!"):
        starts_with_exclamation_point = True
        lemmas = lemmas[1:]
    lemmas = remove_links(lemmas).split(",")
    first_lemma_no_accents = module.remove_accents(lemmas[0])
    return lemmas, first_lemma_no_accents, starts_with_exclamation_point
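A hypothetical input line (assumes args.pos is unset, so the lemmas sit in the second column, and that module.remove_accents strips the acute):

    #   get_lemmas(u"водах !вода́,во́ды")
    #   => ([u"вода́", u"во́ды"], u"вода", True)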
Example #25
def process_page_for_modification(index, pagetitle, text, new_pronuns):
    if pagetitle not in new_pronuns:
        return

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    retval = blib.find_modifiable_lang_section(text, "Old English", pagemsg)
    if retval is None:
        pagemsg("WARNING: Couldn't find Old English section")
        return
    sections, j, secbody, sectail, has_non_lang = retval
    heads = None
    if "Etymology 1" in secbody:
        etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0,
                                 re.M)
        for k in xrange(2, len(etym_sections), 2):
            parsed = blib.parse_text(etym_sections[k])
            secheads = []
            for t in parsed.filter_templates():
                this_heads = get_head_param(t, pagetitle)
                if this_heads:
                    this_heads = [blib.remove_links(x) for x in this_heads]
                    for head in this_heads:
                        if head not in secheads:
                            secheads.append(head)
            if heads is None:
                heads = secheads
            elif set(heads) != set(secheads):
                pagemsg(
                    "Saw head(s) %s in one etym section and %s in another, splitting pronuns per etym section"
                    % (",".join(heads), ",".join(secheads)))
                for k in xrange(2, len(etym_sections), 2):
                    etym_sections[k] = process_section_for_modification(
                        index, pagetitle, etym_sections[k], 4,
                        new_pronuns[pagetitle])
                sections[j] = "".join(etym_sections) + sectail
                return "".join(
                    sections), "add pronunciation(s) to Old English lemma(s)"
        pagemsg(
            "All etym sections have same head(s) %s, creating a single pronun section"
            % ",".join(heads))
    secbody = process_section_for_modification(index, pagetitle, secbody, 3,
                                               new_pronuns[pagetitle])
    sections[j] = secbody + sectail
    return "".join(sections), "add pronunciation(s) to Old English lemma(s)"
Example #26
  def fix_up_section(sectext, warn_on_multiple_heads):
    parsed = blib.parse_text(sectext)

    heads = set()
    pronun_templates = []
    for t in parsed.filter_templates():
      tn = tname(t)
      if lalib.la_template_is_head(t):
        heads |= set(blib.remove_links(x) for x in lalib.la_get_headword_from_template(t, pagetitle, pagemsg))
      elif tn == "la-IPA":
        pronun_templates.append(t)
    if len(heads) > 1:
      if warn_on_multiple_heads:
        pagemsg("WARNING: Found multiple possible heads, not modifying: %s" % ",".join(heads))
      return sectext
    if len(heads) == 0:
      pagemsg("WARNING: Found no possible heads, not modifying: %s" % ",".join(heads))
      return sectext
    newsectext = re.sub(r"\{\{a\|Classical\}\} \{\{IPA(char)?\|.*?\}\}", "{{la-IPA|%s}}" % list(heads)[0], sectext)
    newsectext = re.sub(r"^\* \{\{IPA(char)?\|.*?\|lang=la\}\}", "{{la-IPA|%s}}" % list(heads)[0], newsectext, 0, re.M)
    if newsectext != sectext:
      notes.append("replaced manual Latin pronun with {{la-IPA|%s}}" % list(heads)[0])
      sectext = newsectext
    # Recompute pronun templates as we may have added one.
    parsed = blib.parse_text(sectext)
    pronun_templates = []
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn == "la-IPA":
        pronun_templates.append(t)
    if "{{a|Ecclesiastical}} {{IPA" in sectext:
      if len(pronun_templates) == 0:
        pagemsg("WARNING: Found manual Ecclesiastical pronunciation but not {{la-IPA}} template")
      elif len(pronun_templates) > 1:
        pagemsg("WARNING: Found manual Ecclesiastical pronunciation and multiple {{la-IPA}} templates: %s" %
          ",".join(unicode(tt) for tt in pronun_templates))
      else:
        origt = unicode(pronun_templates[0])
        pronun_templates[0].add("eccl", "yes")
        pagemsg("Replaced %s with %s" % (origt, unicode(pronun_templates[0])))
        newsectext = re.sub(r"^\* \{\{a\|Ecclesiastical\}\} \{\{IPA(char)?\|.*?\}\}\n", "",
            sectext, 0, re.M)
        if newsectext == sectext:
          pagemsg("WARNING: Unable to remove manual Ecclesiastical prounciation")
        else:
          notes.append("removed manual Ecclesiastical pronunciation and added |eccl=yes to {{la-IPA}}")
          sectext = newsectext
    return sectext
def process_text_on_page(index, pagetitle, text):
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    notes = []

    pagemsg("Processing")

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if tn in hindi_head_templates:
            maxtr = 1
            for i in range(1, 10):
                if getparam(t, "tr" if i == 1 else "tr%s" % i):
                    maxtr = i
            for i in range(1, maxtr + 1):
                trparam = "tr" if i == 1 else "tr%s" % i
                tr = getparam(t, trparam)
                if tr:
                    pagemsg("Manual translit tr=%s in %s, not checking" %
                            (tr, unicode(t)))
                else:
                    headparam = "head" if i == 1 else "head%s" % i
                    head = getparam(t, headparam)
                    if head:
                        head = blib.remove_links(head)
                    else:
                        head = pagetitle
                    newtr = expand_text("{{xlit|hi|%s}}" % head)
                    oldtr = expand_text(
                        "{{#invoke:User:Benwing2/hi-translit|tr|%s}}" % head)
                    if newtr and oldtr:
                        if newtr == oldtr:
                            pagemsg(
                                "Auto translit %s same in new and old: %s" %
                                (newtr, unicode(t)))
                        else:
                            pagemsg(
                                "WARNING: Different translit, new=%s, old=%s: %s"
                                % (newtr, oldtr, unicode(t)))
Example #28
def tr(text, lang=None, sc=None, msgfun=msg):
    text = remove_links(text)
    text = tr_canonicalize_bulgarian(text)

    # Remove word-final hard sign
    text = rsub(text, u"[Ъъ]($|[- \]])", ur"\1")

    # ьо becomes jo, Ьо becomes Jo
    text = rsub(text, u"ь(?=[Оо])", ur"j")
    text = rsub(text, u"Ь(?=[Оо])", ur"J")
    text = rsub(text, '.', tt)

    # compose accented characters
    text = tr_canonicalize_latin(text)

    return text
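Illustrative pre-mapping rewrites (the final Latin output comes from the tt table defined elsewhere):

    #   tr(u"градъ")  # the word-final hard sign is deleted before character mapping
    #   tr(u"синьо")  # ь before о becomes j, so the output ends in "jo"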
def infer_decl(t, pagemsg):
    if verbose:
        pagemsg("Processing %s" % unicode(t))

    forms = {}

    # Initialize all cases to blank in case we don't set them again later
    for case, numparam in short_adj_cases_params:
        form = getparam(t, case) or getparam(t, numparam)
        form = form.strip()
        form = blib.remove_links(form)
        forms[case] = form

    def get_form(case):
        if forms[case] == "-":
            return ""
        return forms[case]
Example #30
 def compare_headword_conj_forms(id_slot,
                                 headword_forms,
                                 conj_slots,
                                 adjust_for_missing_perf_forms=False,
                                 remove_conj_links=False):
     conj_forms = ""
     for slot in conj_slots:
         if slot in verb_props:
             conj_forms = verb_props[slot]
             break
     conj_forms = safe_split(conj_forms, ",")
     if remove_conj_links:
         conj_forms = [blib.remove_links(x) for x in conj_forms]
     corrected_headword_forms = [
         lengthen_ns_nf(x) for x in headword_forms
     ]
     corrected_conj_forms = [lengthen_ns_nf(x) for x in conj_forms]
     if adjust_for_missing_perf_forms:
         # There are several instances of 4++ verbs where only the -īvī variant,
         # not the -iī variant, is listed in the headword. Don't get tripped up
         # by that.
         ivi_conj_forms = [
             x for x in corrected_conj_forms if x.endswith(u"īvī")
         ]
         for ivi_conj_form in ivi_conj_forms:
             ii_conj_form = re.sub(u"īvī$", u"iī", ivi_conj_form)
             if ii_conj_form in corrected_conj_forms and ii_conj_form not in corrected_headword_forms:
                 corrected_headword_forms.append(ii_conj_form)
     if set(corrected_headword_forms) != set(corrected_conj_forms):
         macronless_headword_forms = set(
             lalib.remove_macrons(x) for x in corrected_headword_forms)
         macronless_conj_forms = set(
             lalib.remove_macrons(x) for x in corrected_conj_forms)
         if macronless_headword_forms == macronless_conj_forms:
             pagemsg(
                 "WARNING: Headword %s=%s different from conj %s=%s in macrons only, skipping: %s"
                 % (id_slot, ",".join(headword_forms), id_slot,
                    ",".join(conj_forms), render_headword_and_conj()))
         else:
             pagemsg(
                 "WARNING: Headword %s=%s different from conj %s=%s in more than just macrons, skipping: %s"
                 % (id_slot, ",".join(headword_forms), id_slot,
                    ",".join(conj_forms), render_headword_and_conj()))
         return False
     return True
Example #31
 def compare_single_form(f1, f2):
     words1 = re.split("[ -]", f1)
     words2 = re.split("[ -]", f2)
     if len(words1) != len(words2):
         return None
     for i in xrange(len(words1)):
         if words1[i] != words2[i]:
             w1 = fixup_link(words1[i])
             w2 = words2[i]
             # Allow case where existing is monosyllabic and missing a stress
             # compared with proposed
             w1 = {w1, try_to_stress(w1)}
             # Allow case where existing is missing a link as compared to
             # proposed (but not other way around; we don't want a link
             # disappearing)
             w2 = {w2, blib.remove_links(w2)}
             if not (w1 & w2):
                 return None
     return True
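Illustrative outcomes (assumes try_to_stress adds an acute to unstressed monosyllables and fixup_link canonicalizes links):

    #   compare_single_form(u"хлеб", u"хле́б")        => True  (monosyllabic stress difference allowed)
    #   compare_single_form(u"хлеб", u"[[хлеб]]")     => True  (link-only difference allowed)
    #   compare_single_form(u"бе́лый хлеб", u"хлеб")   => None  (word counts differ)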
 def compare(old, new, stuff, nocanon=False):
     if not old:
         return True
     if not nocanon:
         remove_monosyllabic_accents = (
             uk.remove_monosyllabic_stress if args.lang == "uk" else
             be.remove_monosyllabic_accents)
         old = [
             remove_monosyllabic_accents(blib.remove_links(x))
             for x in old
         ]
         new = [remove_monosyllabic_accents(x) for x in new]
     if set(old) != set(new):
         pagemsg(
             "WARNING: Old %ss %s disagree with new %ss %s: head=%s, decl=%s"
             % (stuff, ",".join(old), stuff, ",".join(new),
                unicode(headt), unicode(t)))
         return False
     return True
def compare_headword_decl_forms(id_slot, headword_forms, decl_slots, noun_props,
    headword_and_decl_text, pagemsg, adjust_for_missing_gen_forms=False,
    adjust_for_e_ae_gen=False, remove_headword_links=False):
  decl_forms = ""
  for slot in decl_slots:
    if slot in noun_props:
      decl_forms = noun_props[slot]
      break
  decl_forms = safe_split(decl_forms, ",")
  if remove_headword_links:
    headword_forms = [blib.remove_links(x) for x in headword_forms]
  corrected_headword_forms = [lengthen_ns_nf(x) for x in headword_forms]
  corrected_decl_forms = [lengthen_ns_nf(x) for x in decl_forms]
  if adjust_for_e_ae_gen:
    corrected_headword_forms = [re.sub(u"ē$", "ae", x) for x in headword_forms]
  if adjust_for_missing_gen_forms:
    # Nouns in -ius and -ium are commonly missing the shortened genitive
    # variants. Don't get tripped up by that.
    ii_decl_forms = [x for x in corrected_decl_forms if x.endswith(u"iī")]
    for ii_decl_form in ii_decl_forms:
      i_decl_form = re.sub(u"iī$", u"ī", ii_decl_form)
      if i_decl_form in corrected_decl_forms and i_decl_form not in corrected_headword_forms:
        corrected_headword_forms.append(i_decl_form)
  if set(corrected_headword_forms) != set(corrected_decl_forms):
    macronless_headword_forms = set(lalib.remove_macrons(x) for x in corrected_headword_forms)
    macronless_decl_forms = set(lalib.remove_macrons(x) for x in corrected_decl_forms)
    if macronless_headword_forms == macronless_decl_forms:
      pagemsg("WARNING: Headword %s=%s different from decl %s=%s in macrons only, skipping: %s" % (
        id_slot, ",".join(headword_forms), id_slot, ",".join(decl_forms),
        headword_and_decl_text
      ))
    else:
      pagemsg("WARNING: Headword %s=%s different from decl %s=%s in more than just macrons, skipping: %s" % (
        id_slot, ",".join(headword_forms), id_slot, ",".join(decl_forms),
        headword_and_decl_text
      ))
    return False
  return True
Example #35
def process_page(page, index):
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    seen_trans = [pagetitle]
    parsed = blib.parse(page)
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["t", "t+", "t-", "t+check", "t-check"]:
            trans = blib.remove_links(getparam(t, "2"))
            if trans not in seen_trans:
                seen_trans.append(trans)
    for trans in seen_trans:

        def pagemsg_with_trans(txt):
            pagemsg("%s: %s" % (trans, txt))

        if blib.safe_page_exists(pywikibot.Page(site, trans),
                                 pagemsg_with_trans):
            msg("Page %s %s: Found existing translation for %s" %
                (index, trans, pagetitle))
 def add(val, tr):
   val_to_add = blib.remove_links(val)
   if val_to_add:
     heads.add((val_to_add, tr))
def split_one_page_etymologies(page, index, pagetext, verbose):

  # Fetch pagename, create pagemsg() fn to output msg with page name included
  pagename = page.title()
  pagetext = unicode(pagetext)
  def pagemsg(text):
    msg("Page %s %s: %s" % (index, pagename, text))

  comment = None
  notes = []

  # Split off interwiki links at end
  m = re.match(r"^(.*?\n)(\n*(\[\[[a-z0-9_\-]+:[^\]]+\]\]\n*)*)$",
      pagetext, re.S)
  if m:
    pagebody = m.group(1)
    pagetail = m.group(2)
  else:
    pagebody = pagetext
    pagetail = ""

  # Split into sections
  splitsections = re.split("(^==[^=\n]+==\n)", pagebody, 0, re.M)
  # Extract off pagehead and recombine section headers with following text
  pagehead = splitsections[0]
  sections = []
  for i in xrange(1, len(splitsections)):
    if (i % 2) == 1:
      sections.append("")
    sections[-1] += splitsections[i]

  # Go through each section in turn, looking for existing Arabic section
  for i in xrange(len(sections)):
    m = re.match("^==([^=\n]+)==$", sections[i], re.M)
    if not m:
      pagemsg("WARNING: Can't find language name in text: [[%s]]" % (sections[i]))
    elif m.group(1) == "Arabic":
      # Extract off trailing separator
      mm = re.match(r"^(.*?\n)(\n*--+\n*)$", sections[i], re.S)
      if mm:
        sections[i:i+1] = [mm.group(1), mm.group(2)]
      elif i < len(sections) - 1:
        pagemsg("WARNING: Arabic language section %s is non-final and missing trailing separator" % i)

      for mm in re.finditer("^(==+)[^=\n](==+)$", sections[i], re.M):
        if mm.group(1) != mm.group(2):
          pagemsg("WARNING: Malconstructed header: %s" % mm.group(0))

      subsections = re.split("(^===[^=\n]+=+\n)", sections[i], 0, re.M)
      if len(subsections) < 2:
        pagemsg("WARNING: Page missing any entries")

      etymologies = []
      etymsections = []
      sechead = subsections[0]
      if "\n===Etymology 1=" in sections[i]:
        etyms_were_separate = True
        for j in xrange(1, len(subsections), 2):
          if not re.match("^===Etymology [0-9]+=", subsections[j]):
            pagemsg("WARNING: Non-etymology level-3 header when split etymologies: %s" % subsections[j][0:-1])
        etymsections = [subsections[j] for j in xrange(2, len(subsections), 2)]
        # Reduce indent by one. We will increase it again when we split
        # etymologies.
        for j in xrange(len(etymsections)):
          etymsections[j] = re.sub("^==", "=", etymsections[j], 0, re.M)
      else:
        etyms_were_separate = False
        etymsections = [''.join(subsections[1:])]

      for etymsection in etymsections:
        subsections = re.split("(^===[^=\n]+=+\n)", etymsection, 0, re.M)
        if len(subsections) < 2:
          pagemsg("WARNING: Section missing any entries")
        split_sections = []
        next_split_section = 0
        def append_section(k):
          while len(split_sections) <= next_split_section:
            split_sections.append("")
          split_sections[next_split_section] += \
              subsections[k] + subsections[k + 1]

        last_lemma = None
        last_inflection_of_lemma = None
        for j in xrange(1, len(subsections), 2):
          if re.match("^===+(References|Related|See)", subsections[j]):
            pagemsg("Found level-3 section that should maybe be at higher level: %s" % subsections[j][0:-1])
            append_section(j)
          elif re.match("^===+(Alternative|Etymology)", subsections[j]):
            append_section(j)
          else:
            parsed = blib.parse_text(subsections[j + 1])
            lemma = None
            inflection_of_lemma = None
            for t in parsed.filter_templates():
              if t.name in arabic_all_headword_templates:
                if lemma:
                  if t.name not in ["ar-nisba", "ar-noun-nisba", "ar-verb",
                      "ar-verb-form"]:
                    pagemsg("Found multiple headword templates in section %s: %s" % (j, subsections[j][0:-1]))
                # Note: For verbs this is the form class, which we match on
                lemma = reorder_shadda(remove_links(getparam(t, "1")))
              if t.name == "inflection of":
                if inflection_of_lemma:
                  pagemsg("Found multiple 'inflection of' templates in section %s: %s" % (j, subsections[j][0:-1]))
                inflection_of_lemma = remove_diacritics(
                    remove_links(getparam(t, "1")))
            if not lemma:
              pagemsg("Warning: No headword template in section %s: %s" % (j, subsections[j][0:-1]))
              append_section(j)
            else:
              if lemma != last_lemma:
                next_split_section += 1
              elif (inflection_of_lemma and last_inflection_of_lemma and
                  inflection_of_lemma != last_inflection_of_lemma):
                pagemsg("Verb forms have different inflection-of lemmas %s and %s, splitting etym" % (
                  last_inflection_of_lemma, inflection_of_lemma))
                next_split_section += 1
              last_lemma = lemma
              last_inflection_of_lemma = inflection_of_lemma
              append_section(j)
        etymologies += split_sections

      # Combine adjacent etymologies with same verb form class I.
      # FIXME: We might not want to do this; the etymologies might be
      # legitimately split. Need to check each case.
      j = 0
      while j < len(etymologies) - 1:
        def get_form_class(k):
          formclass = None
          parsed = blib.parse_text(etymologies[k])
          for t in parsed.filter_templates():
            if t.name in ["ar-verb", "ar-verb-form"]:
              newformclass = getparam(t, "1")
              if formclass and newformclass and formclass != newformclass:
                pagemsg("WARNING: Something wrong: Two different verb form classes in same etymology: %s != %s" % (formclass, newformclass))
              formclass = newformclass
          return formclass

        formclassj = get_form_class(j)
        formclassj1 = get_form_class(j + 1)
        if formclassj == "I" and formclassj1 == "I":
          if not etymologies[j + 1].startswith("="):
            pagemsg("WARNING: Can't combine etymologies with same verb form class because second has etymology text")
          else:
            pagemsg("Combining etymologies with same verb form class I")
            etymologies[j] = etymologies[j].rstrip() + "\n\n" + etymologies[j + 1]
            # Cancel out effect of incrementing j below since we combined
            # the following etymology into this one
            j -= 1
        j += 1

      if len(etymologies) > 1:
        for j in xrange(len(etymologies)):
          # Stuff like "===Alternative forms===" that goes before the
          # etymology section should be moved after.
          newetymj = re.sub(r"^(.*?\n)(===Etymology===\n(\n|[^=\n].*?\n)*)",
              r"\2\1", etymologies[j], 0, re.S)
          if newetymj != etymologies[j]:
            pagemsg("Moved ===Alternative forms=== and such after Etymology")
            etymologies[j] = newetymj
          # Remove ===Etymology=== from beginning
          etymologies[j] = re.sub("^===Etymology===\n", "", etymologies[j])
          # Fix up newlines around etymology section
          etymologies[j] = etymologies[j].strip() + "\n\n"
          if etymologies[j].startswith("="):
            etymologies[j] = "\n" + etymologies[j]
        sections[i] = (sechead +
            ''.join(["===Etymology %s===\n" % (j + 1) + etymologies[j]
              for j in xrange(len(etymologies))]))
      elif len(etymologies) == 1:
        if etyms_were_separate:
          # We might need to add an Etymology header at the beginning.
          pagemsg("Combined formerly separate etymologies")
          if not re.match(r"^(=|\{\{wikipedia|\[\[File:)",
              etymologies[0].strip()):
            etymologies[0] = "===Etymology===\n" + etymologies[0]
            pagemsg("Added Etymology header when previously separate etymologies combined")
          # Put Alternative forms section before Etymology.
          newetym0 = re.sub(r"^((?:\n|[^=\n].*?\n)*)(===Etymology===\n(?:\n|[^=\n].*?\n)*)(===(Alternative.*?)===\n(?:\n|[^=\n].*?\n)*)",
              r"\1\3\2", etymologies[0], 0, re.S)
          if newetym0 != etymologies[0]:
            pagemsg("Moved ===Alternative forms=== and such before Etymology")
            etymologies[0] = newetym0

        sections[i] = sechead + etymologies[0]
      else:
        sections[i] = sechead

      break

  # End of loop over sections in existing page; rejoin sections
  newtext = pagehead + ''.join(sections) + pagetail

  # Don't signal a save if only differences are whitespace at end,
  # since it appears that newlines at end get stripped when saving.
  if pagetext.rstrip() == newtext.rstrip():
    pagemsg("No change in text")
  else:
    if verbose:
      pagemsg("Replacing [[%s]] with [[%s]]" % (pagetext, newtext))
    else:
      pagemsg("Text has changed")
    pagetext = newtext

    # Construct and output comment.
    notestext = '; '.join(notes)
    if notestext:
      if comment:
        comment += " (%s)" % notestext
      else:
        comment = notestext
    assert(comment)
    pagemsg("comment = %s" % comment, simple = True)

  return pagetext, comment
Example #38
def check_old_noun_headword_forms(headword_template, args, subpagetitle, pagemsg, laxer_comparison=False):
  # FORM1 is the forms from ru-noun (or ru-proper noun); FORM2 is the combined
  # set of forms from ru-noun-table, and needs to be split on commas.
  # FORM1_LEMMA is true if the FORM1 values come from the ru-noun lemma.
  def compare_forms(case, form1, form2, form1_lemma=False):
    # Split on individual words and allow monosyllabic accent differences.
    # FIXME: Will still have problems with [[X|Y]].
    def compare_single_form(f1, f2):
      words1 = re.split("[ -]", f1)
      words2 = re.split("[ -]", f2)
      if len(words1) != len(words2):
        return None
      for i in xrange(len(words1)):
        if words1[i] != words2[i]:
          w1 = fixup_link(words1[i])
          w2 = words2[i]
          # Allow case where existing is monosyllabic and missing a stress
          # compared with proposed
          w1 = {w1, try_to_stress(w1)}
          # Allow case where existing is missing a link as compared to
          # proposed (but not other way around; we don't want a link
          # disappearing)
          w2 = {w2, blib.remove_links(w2)}
          if not (w1 & w2):
            return None
      return True
    form1 = [fixup_link(re.sub(u"ё́", u"ё", x)) for x in form1]
    form2 = re.split(",", form2)
    if laxer_comparison or not form1_lemma:
      # Ignore manual translit in decl forms when comparing non-lemma forms;
      # not available from ru-noun (and not displayed anyway); also when
      # laxer_comparison is set, which happens in add_noun_decl
      form2 = [re.sub("//.*$", "", x) for x in form2]
    # If existing value missing, OK; also allow for unstressed monosyllabic
    # existing form matching stressed monosyllabic new form
    if form1:
      if (set(form1) == set(form2) or
          set(try_to_stress(x) for x in form1) == set(form2) or
          len(form1) == 1 and len(form2) == 1 and compare_single_form(form1[0], form2[0])):
        pass
      else:
        pagemsg("WARNING: case %s, existing forms %s not same as proposed %s" %(
            case, ",".join(form1), ",".join(form2)))
        return None
    return True

  def compare_genders(g1, g2):
    if set(g1) == set(g2):
      return True
    if len(g1) == 1 and len(g2) == 1:
      # If genders don't match exactly, check if existing gender is missing
      # animacy and allow that, so it gets overwritten with new gender
      if g1[0] == re.sub("-(an|in)", "", g2[0]):
        pagemsg("Existing gender %s missing animacy spec compared with proposed %s, allowed" % (
          ",".join(g1), ",".join(g2)))
        return True
    return None

  headwords = blib.fetch_param_chain(headword_template, "1", "head", subpagetitle)
  translits = blib.fetch_param_chain(headword_template, "tr", "tr")
  for i in xrange(len(translits)):
    if len(headwords) <= i:
      pagemsg("WARNING: Not enough headwords for translit tr%s=%s, skipping" % (
        "" if i == 0 else str(i+1), translits[i]))
      return None
    else:
      headwords[i] += "//" + translits[i]
  genitives = blib.fetch_param_chain(headword_template, "3", "gen")
  plurals = blib.fetch_param_chain(headword_template, "4", "pl")
  genders = blib.fetch_param_chain(headword_template, "2", "g")
  cases_to_check = None
  if args["n"] == "s":
    if (not compare_forms("nom_sg", headwords, args["nom_sg_linked"], True) or
        not compare_forms("gen_sg", genitives, args["gen_sg"])):
      pagemsg("Existing and proposed forms not same, skipping")
      return None
    cases_to_check = ["nom_sg", "gen_sg"]
  elif args["n"] == "p":
    if (not compare_forms("nom_pl", headwords, args["nom_pl_linked"], True) or
        not compare_forms("gen_pl", genitives, args["gen_pl"])):
      pagemsg("Existing and proposed forms not same, skipping")
      return None
    cases_to_check = ["nom_pl", "gen_pl"]
  elif args["n"] == "b":
    if (not compare_forms("nom_sg", headwords, args["nom_sg_linked"], True) or
        not compare_forms("gen_sg", genitives, args["gen_sg"]) or
        not compare_forms("nom_pl", plurals, args["nom_pl"])):
      pagemsg("Existing and proposed forms not same, skipping")
      return None
    cases_to_check = ["nom_sg", "gen_sg", "nom_pl"]
  else:
    pagemsg("WARNING: Unrecognized number spec %s, skipping" % args["n"])
    return None

  for case in cases_to_check:
    raw_case = re.sub(u"△", "", blib.remove_links(args[case + "_raw"]))
    if args[case] != raw_case:
      pagemsg("WARNING: Raw case %s=%s contains footnote symbol" % (
        case, args[case + "_raw"]))

  proposed_genders = re.split(",", args["g"])
  if compare_genders(genders, proposed_genders):
    genders = []
  else:
    # Check for animacy mismatch, punt if so
    cur_in = [x for x in genders if re.search(r"\bin\b", x)]
    cur_an = [x for x in genders if re.search(r"\ban\b", x)]
    proposed_in = [x for x in proposed_genders if re.search(r"\bin\b", x)]
    proposed_an = [x for x in proposed_genders if re.search(r"\ban\b", x)]
    if (cur_in or not cur_an) and proposed_an or (cur_an or not cur_in) and proposed_in:
      pagemsg("WARNING: Animacy mismatch, skipping: cur=%s proposed=%s" % (
        ",".join(genders), ",".join(proposed_genders)))
      return None
    # Check for number mismatch, punt if so
    cur_pl = [x for x in genders if re.search(r"\bp\b", x)]
    if cur_pl and args["n"] != "p" or not cur_pl and args["n"] == "p":
      pagemsg("WARNING: Number mismatch, skipping: cur=%s, proposed=%s, n=%s" % (
        ",".join(genders), ",".join(proposed_genders), args["n"]))
      return None
    pagemsg("WARNING: Gender mismatch, existing=%s, new=%s" % (
      ",".join(genders), ",".join(proposed_genders)))

  return genders
Example #39
            lemma = re.sub(u"([кгхшжчщ])ый$", r"\1ий", lemma)
            pagemsg("WARNING: Inferring adjectival lemma from inflection, please check: lemma=%s, infl=%s" %
                (lemma, infl))
            break
        else:
          pagemsg("WARNING: Assuming word is inflected adj or noun, please check: lemma=%s, infl=%s" %
              (lemma, infl))
      else:
        infl = word
        lemma = ru.remove_accents(infl)
        saw_unlinked_word = True
    lemmas_infls.append((lemma, infl))

  if see_template:
    pagemsg("Found decl-see template: %s" % unicode(see_template))
    inflected_words = set(ru.remove_accents(blib.remove_links(unicode(x.value)))
        for x in see_template.params)
    if saw_unlinked_word:
      pagemsg("WARNING: Unlinked word(s) in headword, found decl-see template, proceeding, please check: %s" % headword)
  else:
    # Try to figure out which words are inflected and which words aren't
    pagemsg("No ru-decl-noun-see template, inferring which headword words are inflected")
    if saw_unlinked_word:
      pagemsg("WARNING: Unlinked word(s) in headword, no decl-see template, skipping: %s" % headword)
      return
    inflected_words = set()
    saw_noun = False
    reached_uninflected = False
    wordind = 0
    for word, lemmainfl in zip(headwords, lemmas_infls):
      wordind += 1
 def check_bad_head(text, arg):
   canontext = re.sub(u"[׳’]", "'", blib.remove_links(text))
   canonpagetitle = re.sub(u"[׳’]", "'", pagetitle)
   if canontext != canonpagetitle:
     pagemsg("WARNING: Canonicalized %s=%s not same as canonicalized page title %s (orig %s=%s)" %
         (arg, canontext, canonpagetitle, arg, text))
def infer_decl(t, pagemsg):
  if verbose:
    pagemsg("Processing %s" % unicode(t))

  forms = {}

  # Initialize all cases to blank in case we don't set them again later
  for case, numparam in short_adj_cases_params:
    form = getparam(t, case) or getparam(t, numparam)
    form = form.strip()
    form = blib.remove_links(form)
    forms[case] = form

  def get_form(case):
    if forms[case] == "-":
      return ""
    return forms[case]

  m = get_form("short_m")
  f = get_form("short_f")
  n = get_form("short_n")
  p = get_form("short_p")

  specials = ["", m]
  explicit_msg = None

  stem = getparam(t, "1")
  decl = getparam(t, "2")
  if not m and not f and not n and not p:
    pagemsg("No short forms, skipping")
    return None
  elif not m and f and n and p:
    pagemsg("Missing short masculine but other short forms present, continuing")
  elif m and not f and not n and not p:
    pagemsg("Found only short m")
    stem, decl = combine_stem(stem, decl)
    args = [stem, decl] + ["short_m=%s" % m]
    if trymatch(t, args, pagemsg):
      return args
    else:
      return None
  elif not m or not f or not n or not p:
    pagemsg("WARNING: Some short forms missing, skipping: m=%s, f=%s, n=%s, p=%s" % (m or "blank", f or "blank", n or "blank", p or "blank"))
    return None
  if re.search("(^|:)[abc*]", decl):
    pagemsg("WARNING: Decl spec %s already has short accent class but short forms present? Skipping ...")
    return None
  if not decl:
    newstem, decl = detect_stem(stem, decl)
    if not decl:
      pagemsg("WARNING: Unable to detect stem type for stem=%s" % stem)
      return None
    stem = newstem
  if decl == "short" or decl == "mixed" or decl == u"ьий":
    if f or n or p:
      pagemsg("WARNING: Short forms found when not allowed: f=%s, n=%s, p=%s" % (f or "blank", n or "blank", p or "blank"))
      return None
    pagemsg("Skipping decl type %s, no short forms allowed" % decl)
    return None
  if "," in m:
    pagemsg("WARNING: Multiple masculine forms, something wrong: m=%s" % m)
    return None
  f2 = "," in f
  n2 = "," in n
  p2 = "," in p
  def get_stressed_form(form):
    if "," not in form:
      return form
    forms = re.split(r"\s*,\s*", form)
    if len(forms) > 2:
      pagemsg("WARNING: More than two forms in %s" % form)
      return None
    for frm in forms:
      if not re.search(AC + "$", frm):
        return frm
    pagemsg("WARNING: Multiple forms but none stem-stressed: %s" % form)
    return forms[0]
  sf = get_stressed_form(f)
  sn = get_stressed_form(n)
  sp = get_stressed_form(p)
  fend = re.search(AC + "$", f)
  nend = re.search(AC + "$", n)
  pend = re.search(AC + "$", p)
  mm = re.search(u"^(.*)[ая]́?$", sf)
  if not mm:
    pagemsg("WARNING: Unable to recognize feminine ending: %s" % sf)
    return None
  fstem = mm.group(1)
  mm = re.search(u"^(.*)[оеё]́?$", sn)
  if not mm:
    pagemsg("WARNING: Unable to recognize neuter ending: %s" % sn)
    return None
  nstem = mm.group(1)
  mm = re.search(u"^(.*)[ыи]́?$", sp)
  if not mm:
    pagemsg("WARNING: Unable to recognize plural ending: %s" % sp)
    return None
  pstem = mm.group(1)
  mm = re.search(u"^(.*?)[ъьй]?$", m)
  assert mm
  mstem = mm.group(1)
  short_stem = stem
  if is_stressed(fstem):
    short_stem = fstem
  elif is_stressed(nstem):
    short_stem = nstem
  elif is_stressed(pstem):
    short_stem = pstem
  else:
    if make_unstressed_once(fstem) == make_unstressed_once(mstem):
      short_stem = mstem
  if is_unstressed(stem):
    stem = make_ending_stressed(stem)
  short_stem = try_to_stress(short_stem)
  if stem == short_stem:
    short_stem = ""
  elif short_stem + u"н" == stem and re.search(u"нн[иы]й$", stem + decl):
    pagemsg("Found special (2): short stem %s, long stem %s" % (short_stem, stem))
    specials = ["(2)"]
    short_stem = ""
  else:
    pagemsg("WARNING: Found short stem %s different from long stem %s" %
        (short_stem, stem))
  real_short_stem = short_stem or stem
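  # Special (1) means the masc sg is missing one н of a short stem ending
  # in -нн-; special (2), detected above, means the whole short stem has
  # one н fewer than the long stem (adjectives in -нный).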
  if specials != ["(2)"] and mstem != real_short_stem:
    if mstem + u"н" == real_short_stem and re.search(u"нн$", real_short_stem):
      pagemsg("Found special (1): short stem %s, masculine stem %s" % (
        real_short_stem, mstem))
      specials = ["(1)"]
    elif make_unstressed_once(stem) == mstem:
      # Can happen with monosyllabic masculines
      pass
    elif not m:
      pagemsg("Missing short masculine singular")
      if real_short_stem.endswith(u"нн"):
        specials = ["(1)"]
      explicit_msg = "-"
    else:
      pagemsg("Masculine short stem %s differs from short stem %s, presumed reducible" % (mstem, real_short_stem))
      if "(1)" in specials or "(2)" in specials:
        pagemsg("WARNING: Can't have reducible and special together")
        return None
      specials = ["*", m]
  ff = "both" if f2 else "end" if fend else "stem"
  nn = "both" if n2 else "end" if nend else "stem"
  pp = "both" if p2 else "end" if pend else "stem"
  def match(fval, nval, pval):
    return ff == fval and nn == nval and pp == pval
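  # Each of ff/nn/pp is "stem", "end" or "both" according to where that
  # form is stressed; the combination selects the short accent pattern.
  # E.g. an ending-stressed feminine with stem-stressed neuter and plural
  # is pattern c; if the plural additionally has both variants, c'.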
  stress = (match("stem", "stem", "stem") and "a" or
            match("both", "stem", "stem") and "a'" or
            match("end", "end", "end") and "b" or
            match("end", "end", "both") and "b'" or
            match("end", "stem", "stem") and "c" or
            match("end", "stem", "both") and "c'" or
            match("end", "both", "both") and "c''" or
            None)
  if "*" in specials and not is_monosyllabic(m) and (
      (stress in ["b", "b'"]) != (not not is_ending_stressed(m))):
    pagemsg("WARNING: (De)reducible short masc sg %s has wrong stress for accent pattern %s, setting manual masc sg" % (m, stress))
    explicit_msg = m
  if not stress:
    pagemsg("WARNING: Unrecognized stress: m=%s f=%s n=%s p=%s" % (
      m, f, n, p))
    return None

  stem, decl = combine_stem(stem, decl)
  for special in specials:
    if special not in ["", "*", "(1)", "(2)"]:
      if explicit_msg:
        if special == explicit_msg:
          pass
        else:
          pagemsg("WARNING: Something wrong; trying to set explicit short masc sg %s when there's an existing setting %s" % (
            special, explicit_msg))
      else:
        explicit_msg = special
      special = ""
    special = stress + special
    declspec = special + (":" + short_stem if short_stem else "")
    if decl:
      declspec = decl + ":" + declspec
    args = [stem, declspec]
    if explicit_msg:
      args.append("short_m=" + explicit_msg)
    if trymatch(t, args, pagemsg):
      return args
  pagemsg("WARNING: Unable to infer short accent")
  return None

def process_page_section(index, page, section, verbose):
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None

  parsed = blib.parse_text(section)

  noun_table_templates = []
  noun_old_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)

  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if len(noun_table_templates) < 1:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
          ", ".join(unicode(x) for x in noun_old_templates))
    return unicode(parsed), 0, 0, 0, 0

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      pagemsg("Found ru-noun or ru-proper noun, skipping")
      return None

  headword_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      headword_templates.append(t)

  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if len(headword_templates) < 1:
    return unicode(parsed), 0, 0, 0, 0

  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
  headword_template = headword_templates[0]
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  orig_headword_template = unicode(headword_template)
  orig_noun_table_template = unicode(noun_table_template)

  genders = blib.fetch_param_chain(headword_template, "g", "g")
  masculines = blib.fetch_param_chain(headword_template, "m", "m")
  feminines = blib.fetch_param_chain(headword_template, "f", "f")
  notrcat = getparam(headword_template, "notrcat")
  filtered_headword_params = []
  for param in headword_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name) or name == "notrcat":
      pass
    else:
      filtered_headword_params.append((param.name, param.value))
  filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
  for name, value in filtered_headword_params:
    filtered_headword_template.add(name, value)

  ru_noun_table_cleaned = 0
  ru_noun_table_link_copied = 0
  ru_noun_changed = 0
  ru_proper_noun_changed = 0

  new_decl_params = []
  for param in noun_table_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name):
      pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" %
          unicode(noun_table_template))
    else:
      new_decl_params.append((param.name, param.value))
  del noun_table_template.params[:]
  for name, value in new_decl_params:
    noun_table_template.add(name, value)
  if orig_noun_table_template != unicode(noun_table_template):
    ru_noun_table_cleaned = 1

  modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  for param in noun_table_template.params:
    modified_noun_table_template.add(param.name, param.value)

  # For proper nouns, expand the declension template so we can inspect the
  # computed number restriction (n=) and fix up the new headword below
  if unicode(headword_template.name) == "ru-proper noun+":
    generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
        unicode(noun_table_template))
    generate_result = expand_text(generate_template)
    if not generate_result:
      pagemsg("WARNING: Error generating noun args, skipping")
      return None
    args = ru.split_generate_args(generate_result)
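    # split_generate_args() parses the expanded template output into a
    # dict-like mapping of form/property names; we only need args["n"].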

    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
      pagemsg("Adding n=both to headword template")
      modified_noun_table_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
          generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = ru.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
              existing_n)
        pagemsg("Removing n=sg from headword template")
        rmparam(modified_noun_table_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
            ndef_args["n"])

  new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
      unicode(modified_noun_table_template))
  existing_filtered_headword_template = unicode(filtered_headword_template)
  change_existing_headword = False
  if existing_filtered_headword_template != new_headword_template:
    if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
      if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
        pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
        del noun_table_template.params[:]
        for param in filtered_headword_template.params:
          noun_table_template.add(param.name, param.value)
        ru_noun_table_link_copied = 1
        ru_noun_table_cleaned = 0
      else:
        pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually"
            % (existing_filtered_headword_template, new_headword_template))
        return None
    else:
      pagemsg("WARNING: Existing headword template %s will be overwritten with %s"
          % (existing_filtered_headword_template, new_headword_template))
      change_existing_headword = True

  if change_existing_headword and (not lemmas or pagetitle in lemmas):
    del headword_template.params[:]
    for param in modified_noun_table_template.params:
      headword_template.add(param.name, param.value)
    blib.set_param_chain(headword_template, genders, "g", "g")
    blib.set_param_chain(headword_template, masculines, "m", "m")
    blib.set_param_chain(headword_template, feminines, "f", "f")
    if notrcat:
      headword_template.add("notrcat", notrcat)
    
  #genders = runoun.check_old_noun_headword_forms(headword_template, args,
  #    subpagetitle, pagemsg)
  #if genders == None:
  #  return None

  #new_params = []
  #for param in noun_table_template.params:
  #  new_params.append((param.name, param.value))

  #params_to_preserve = runoun.fix_old_headword_params(headword_template,
  #    new_params, genders, pagemsg)
  #if params_to_preserve == None:
  #  return None

  new_noun_table_template = unicode(noun_table_template)
  if new_noun_table_template != orig_noun_table_template:
    pagemsg("Replacing noun table %s with %s" % (orig_noun_table_template,
      new_noun_table_template))

  new_headword_template = unicode(headword_template)
  if new_headword_template != orig_headword_template:
    pagemsg("Replacing headword %s with %s" % (orig_headword_template,
      new_headword_template))
    if unicode(headword_template.name) == "ru-noun+":
      ru_noun_changed = 1
    else:
      ru_proper_noun_changed = 1

  return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed

def create_declension(page, index, save, pos, tempname, decltempname, sgnum,
    removeparams, is_proper=False):
  pagename = page.title()
  comments = []

  def pgmsg(text):
    msg("Page %s %s: %s" % (index, pagename, text))

  # Starts with definite article al-
  def starts_with_al(text):
    return re.match(ALIF_ANY + A + "?" + L, text)

  def sub_if(fr, to, text):
    if re.search(fr, text):
      return re.sub(fr, to, text)
    else:
      return ""

  # Remove definite article al- from text
  def remove_al(text):
    return (sub_if("^" + ALIF_ANY + A + "?" + L + SK + "?(.)" + SH, r"\1", text)
        or sub_if("^" + ALIF_ANY + A + "?" + L + SK + "?", "", text)
        or text)
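  # For instance, an assimilated article (sun letter with shadda, as in
  # aš-šams) is collapsed to the bare consonant by the first branch, while
  # a plain al- prefix (as in al-kitāb) is stripped by the second.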

  # Remove definite article al- from transliterated text
  def remove_al_tr(text):
    return (sub_if(ur"^a?([sšṣtṯṭdḏḍzžẓnrḷ])-\1", r"\1", text) or
        sub_if("^a?l-", "", text) or
        text)
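  # E.g. remove_al_tr(u"aš-šams") -> u"šams" (assimilated article) and
  # remove_al_tr(u"al-kitāb") -> u"kitāb".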

  # Split off interwiki links at end
  m = re.match(r"^(.*?\n+)((\[\[[a-z0-9_\-]+:[^\]]+\]\]\n*)*)$",
      page.text, re.S)
  if m:
    pagebody = m.group(1)
    pagetail = m.group(2)
  else:
    pagebody = page.text
    pagetail = ""

  # Split top-level sections (by language)
  splitsections = re.split("(^==[^=\n]+==\n)", pagebody, 0, re.M)

  # Separate off the page head, then recombine section headers with their
  # following text
  pagehead = splitsections[0]
  sections = []
  for i in xrange(1, len(splitsections)):
    if (i % 2) == 1:
      sections.append("")
    sections[-1] += splitsections[i]

  # Look for Arabic section
  for seci in xrange(len(sections)):
    m = re.match("^==([^=\n]+)==$", sections[seci], re.M)
    if not m:
      pgmsg("Can't find language name in text: [[%s]]" % (sections[seci]))
    elif m.group(1) == "Arabic":
      # Extract off trailing separator
      mm = re.match(r"^(.*?\n+)(--+\n*)$", sections[seci], re.S)
      if mm:
        secbody = mm.group(1)
        sectail = mm.group(2)
      else:
        secbody = sections[seci]
        sectail = ""

      # Split into subsections based on headers
      subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)

      # Go through each subsection
      for j in xrange(len(subsections)):
        notes = []

        def add_note(note):
          if note not in notes:
            notes.append(note)

        # Look for subsections matching the given POS
        if j > 0 and (j % 2) == 0 and re.match("^===+%s===+\n" % pos, subsections[j - 1]):
          # Call reorder_shadda here so the templates we work with have
          # shadda in the correct order; we leave the remaining text alone
          # to avoid unnecessary saving
          parsed = blib.parse_text(reorder_shadda(subsections[j]))

          def pagemsg(text):
            pgmsg("%s: [[%s]]" % (text, subsections[j]))

          # Check for various conditions causing us to skip this entry and
          # not try to add a declension table

          # Skip declension if certain templates found in definition.
          # We don't check for {{alternative form of|...}}, because it's
          # used for e.g. different ways of spelling "camera" in Arabic,
          # some with -ā and some with -a, so we still want to create
          # declensions for those.
          altspelling_templates = [temp for temp in parsed.filter_templates() if temp.name in
              ["alternative spelling of"]]
          if len(altspelling_templates) > 0:
            pagemsg("Alternative spelling redirect found in text, skipping")
            continue
          if pos == "Adjective":
            feminine_of_templates = [temp for temp in parsed.filter_templates() if temp.name in
                ["feminine of"]]
            if len(feminine_of_templates) > 0:
              pagemsg("feminine-of template found for adjective, skipping")
              continue

          # Retrieve headword_template, make sure exactly one and it is the right type
          headword_templates = [temp for temp in parsed.filter_templates() if temp.name in
              ["ar-noun", "ar-proper noun", "ar-coll-noun", "ar-sing-noun",
                "ar-noun-pl", "ar-noun-dual", "ar-adj-fem", "ar-adj-pl",
                "ar-noun-inf-cons", "ar-adj-inf-def",
                "ar-adj-dual", "ar-adj", "ar-nisba", "ar-noun-nisba",
                "ar-adj-sound", "ar-adj-in", "ar-adj-an"]]
          if len(headword_templates) == 0:
            pagemsg("WARNING: Can't find headword template in text, skipping")
            continue
          if len(headword_templates) > 1:
            pagemsg("WARNING: Found multiple headword templates in text, skipping")
            continue
          headword_template = headword_templates[0]
          if headword_template.name != tempname:
            pagemsg("Headword template should be '%s' but is '%s', skipping" % (tempname, headword_template.name))
            continue
          def getp(param):
            return getparam(headword_template, param)
          # NOTE: We physically add and remove parameters from the headword
          # template to get the list of parameters to use in creating the
          # declension template. These changes don't get propagated to the
          # headword template because we don't convert the parsed text back
          # to a string.
          def putp(param, value):
            addparam(headword_template, param, value)
          head = getp("1")
          orighead = head

          # Check for declension already present
          if (j + 1 < len(subsections) and
              re.match("^===+Declension===+\n", subsections[j + 1])
              or j + 3 < len(subsections) and
              re.match("^===+Usage", subsections[j + 1]) and
              re.match("^===+Declension===+\n", subsections[j + 3])
              ):
            pagemsg("Declension already found for head %s, skipping" % head)
            continue

          # Check for cpl
          # FIXME: Convert cpl into pl and fpl
          if getp("cpl"):
            pagemsg("WARNING: Headword template for head %s has cpl param in it, skipping" % (head))
            continue

          # Check for empty head. If w/o explicit translit, skip; else,
          # fetch head from page title.
          if not head:
            if not getp("tr"):
              pagemsg("WARNING: Headword template head is empty and without explicit translit, skipping")
              continue
            else:
              pagemsg("Headword template head is empty but has explicit translit")
              add_note("empty head, using page name")
            head = pagename
            putp("1", head)

          # Try to handle cases with a modifier; we can't handle all of them yet
          headspace = False
          if ' ' in head:
            headspace = True
            words = re.split(r"\s", remove_links(head))
            head = words[0]
            if len(words) > 2:
              pagemsg("WARNING: Headword template head %s has two or more spaces in it, skipping" % orighead)
              continue
            assert(len(words) == 2)

            # Check for params we don't yet know how to handle
            must_continue = False
            for badparam in ["pl2", "pltr", "head2", "sing", "coll"]:
              if getp(badparam):
                # FIXME
                pagemsg("WARNING: Headword template head %s has space in it and param %s, skipping" % (orighead, badparam))
                must_continue = True
                break
            if must_continue:
              continue

            # Now check for various types of construction, all either
            # construct (ʾidāfa) or adjectival

            def remove_nom_gen_i3rab(word, nomgen, undia, undiatext, udia, udiatext):
              if word.endswith(undia):
                pagemsg("Removing %s i3rab (%s) from %s" % (nomgen, undiatext, word))
                add_note("removing %s i3rab (%s)" % (nomgen, undiatext))
                return re.sub(undia + "$", "", word)
              if word.endswith(udia):
                pagemsg("Removing %s i3rab (%s) from %s" % (nomgen, udiatext, word))
                add_note("removing %s i3rab (%s)" % (nomgen, udiatext))
                return re.sub(udia + "$", "", word)
              if re.search(DIACRITIC_ANY_BUT_SH + "$", word):
                pagemsg("WARNING: Strange diacritic at end of %s %s" % (nomgen, word))
              if word[0] == ALIF_WASLA:
                pagemsg("Changing %s alif wasla to plain alif for %s" % (nomgen, word))
                add_note("changing %s alif wasla to plain alif" % (nomgen))
                word = ALIF + word[1:]
              return word

            def remove_gen_i3rab(word):
              return remove_nom_gen_i3rab(word, "genitive", IN, "IN", I, "I")

            def remove_nom_i3rab(word):
              return remove_nom_gen_i3rab(word, "nominative", UN, "UN", U, "U")

            def remove_gen_i3rab_tr(word):
              return remove_nom_gen_i3rab(word, "genitive", "in", "in", "i", "i")

            def remove_nom_i3rab_tr(word):
              return remove_nom_gen_i3rab(word, "nominative", "un", "un", "u", "u")

            idafa = False
            word0al = starts_with_al(words[0])
            word1al = starts_with_al(words[1])
            words[0] = remove_al(words[0])
            words[1] = remove_al(words[1])
            putp("1", words[0])
            putp("mod", words[1])
            if word0al and word1al:
              pagemsg("Headword template head %s has space in it and found definite adjective construction" % (orighead))
              add_note("modifier definite adjective construction")
              putp("state", "def")
            elif word0al and not word1al:
              pagemsg("WARNING: Headword template head %s has space in it and found al-X + Y construction, can't handle, skipping" % (orighead))
              continue
            elif is_proper:
              if words[0].endswith(ALIF) and word1al:
                pagemsg("Proper noun headword template head %s has space in it and found ind-def with definite adjectival modifier" % (orighead))
                add_note("modifier proper noun + definite adjective construction")
                putp("state", "ind-def")
              elif remove_diacritics(words[0]) == u"جمهورية":
                if word1al:
                  pagemsg("Proper noun headword template head %s has space in it and found definite idafa" % (orighead))
                  add_note("modifier definite idafa construction")
                  idafa = True
                  assert sgnum == "sg"
                  idafaval = "def"
                  putp("idafa", idafaval)
                elif words[1].endswith(ALIF):
                  pagemsg("Proper noun headword template head %s has space in it and found idafa with ind-def modifier" % (orighead))
                  add_note("modifier proper-noun ind-def idafa construction")
                  assert sgnum == "sg"
                  idafaval = "ind-def"
                  putp("idafa", idafaval)
                else:
                  pagemsg("WARNING: Proper noun headword template head %s has space in it and found idafa construction we can't handle, skipping" % (orighead))
                  continue
              else:
                  pagemsg("WARNING: Proper noun headword template head %s has space in it and can't determine whether idafa, skipping" % (orighead))
                  continue

            elif not word0al and word1al:
              # Found an ʾidāfa construction
              pagemsg("Headword template head %s has space in it and found definite idafa" % (orighead))
              add_note("modifier definite idafa construction")
              idafa = True
              idafaval = "def-" + sgnum
              if idafaval == "def-sg":
                idafaval = "def"
              putp("idafa", idafaval)
            elif words[1].endswith(I + Y):
              pagemsg("WARNING: Headword template head %s has space in it and appears to end in badly formatted nisba, FIXME, skipping" % (orighead))
              continue
            elif words[1].endswith(I + Y + SH):
              pagemsg("Headword template head %s has space in it and found indefinite adjective nisba construction" % (orighead))
              add_note("modifier indefinite nisba adjective construction")
            elif pagename in adjectival_phrases:
              pagemsg("Headword template head %s has space in it, indefinite, and manually specified to be adjectival" % (orighead))
              add_note("modifier indefinite adjective construction")
            else:
              pagemsg("Headword template head %s has space in it, indefinite, and not specified to be adjectival, assuming idafa" % (orighead))
              add_note("modifier indefinite idafa construction")
              idafa = True
              putp("idafa", sgnum)

            # Now remove any i3rab diacritics
            putp("1", remove_nom_i3rab(getp("1")))
            if idafa:
              putp("mod", remove_gen_i3rab(getp("mod")))
            else:
              putp("mod", remove_nom_i3rab(getp("mod")))

            # Now check if the lemma is plural
            if re.match(r"\bp\b", getp("2")):
              pagemsg("Headword template head %s has space in it and is plural" % (orighead))
              add_note("plural lemma")
              if getp("tr"):
                # FIXME (doesn't occur though)
                pagemsg("WARNING: Headword template head %s has space in it and manual translit and is plural, skipping" % (orighead))
                continue
              putp("pl", getp("1"))
              putp("1", "-")
              if not idafa:
                putp("modpl", getp("mod"))
                putp("mod", "-")

            # Now check if lemma has plural specified
            elif getp("pl"):
              pls = re.split(r"\s", remove_links(getp("pl")))
              assert(len(pls) == 2)
              pls[0] = remove_al(pls[0])
              pls[1] = remove_al(pls[1])
              putp("pl", remove_nom_i3rab(pls[0]))
              if not idafa:
                putp("modpl", remove_nom_i3rab(pls[1]))
              else:
                if pls[1] != getp("mod"):
                  pagemsg("FIXME: Headword template head %s, plural modifier %s not same as singular modifier %s in idafa construction" % (orighead, pls[1], getp("mod")))

            # Now check if there's manual translit. We need to split the
            # manual translit in two and pair up manual translit with
            # corresponding Arabic words. But first remove -t indicating
            # construct state, and check to see if manual translit is
            # same as auto translit, in which case it's unnecessary.
            if getp("tr"):
              pagemsg("Headword template head %s has space in it and manual translit" % (orighead))
              trwords = re.split(r"\s", getp("tr"))
              assert(len(trwords) == 2)
              trwords[0] = remove_nom_i3rab_tr(remove_al_tr(trwords[0]))
              if idafa:
                trwords[1] = remove_gen_i3rab_tr(remove_al_tr(trwords[1]))
              else:
                trwords[1] = remove_nom_i3rab_tr(remove_al_tr(trwords[1]))
              # Remove any extraneous -t from translit, arising either from
              # the construct state or from removal of i3rab in a feminine
              # noun/adj.
              for i in [0, 1]:
                if words[i].endswith(TAM) and trwords[i].endswith("t"):
                  trwords[i] = trwords[i][0:-1]
                if words[i].endswith(ALIF + TAM) and not trwords[i].endswith("h"):
                  trwords[i] += "h"
              if ar_translit.tr(words[0]) != trwords[0]:
                pagemsg("Headword template head %s has space in it and manual translit %s which is different from auto-translit of %s" % (orighead, trwords[0], words[0]))
                add_note("modified head w/manual translit")
                putp("1", "%s/%s" % (getp("1"), trwords[0]))
              else:
                pagemsg("Headword template head %s has space in it and manual translit %s which is same as auto-translit of %s" % (orighead, trwords[0], words[0]))
                add_note("modified head w/ignored manual translit")
              if ar_translit.tr(words[1]) != trwords[1]:
                pagemsg("Headword template head %s has space in it and manual translit %s which is different from auto-translit of %s" % (orighead, trwords[1], words[1]))
                add_note("modifier w/manual translit")
                putp("mod", "%s/%s" % (getp("mod"), trwords[1]))
              else:
                pagemsg("Headword template head %s has space in it and manual translit %s which is same as auto-translit of %s" % (orighead, trwords[1], words[1]))
                add_note("modifier w/ignored manual translit")

          else:
            # no space in head, not dealing with a modifier

            # If has link in it, just remove it
            if '[' in head or ']' in head or '|' in head:
              pagemsg("Headword template head %s has link in it" % (head))
              add_note("removed links from head")
              head = remove_links(head)
              putp("1", head)

            # If starts with definite article, remove article from everything,
            # including transliterations, and set state=def
            if starts_with_al(head):
              pagemsg("Headword template head %s starts with definite article" % (head))
              add_note("definite lemma")
              head = remove_al(head)
              putp("1", head)
              putp("state", "def")
              # Also remove al- from remaining head and pl params
              def check_for_al(param):
                value = getparam(headword_template, param)
                if value:
                  if '[' in value or ']' in value or '|' in value:
                    pagemsg("Param %s value %s has link in it" % (param, value))
                    add_note("removed links from %s" % param)
                    value = remove_links(value)
                  putp(param, remove_al(value))
              params_to_check = ["pl", "sing", "coll", "pauc", "f", "fpl"]
              for param in params_to_check:
                check_for_al(param)
              for i in xrange(2, 10):
                check_for_al("head%s" % i)
                for param in params_to_check:
                  check_for_al("%s%s" % (param, i))
              # Also remove al- from transliteration
              def check_for_al_tr(param):
                value = getparam(headword_template, param)
                if value:
                  putp(param, remove_al_tr(value))
              check_for_al("tr")
              for param in params_to_check:
                check_for_al("%str" % param)
              for i in xrange(2, 10):
                check_for_al("tr%s" % i)
                for param in params_to_check:
                  check_for_al("%s%str" % (param, i))
            elif is_proper:
              if head.endswith(ALIF):
                pagemsg(u"Headword template head %s ends in -ā" % (head))
                putp("state", "ind-def")
              else:
                pagemsg(u"WARNING: Headword template head %s is indefinite proper noun, not ending in -ā, skipping" % (head))
                continue

            if head.endswith(UN):
              pagemsg("Headword template head %s ends with explicit i3rab (UN)" % (head))
              add_note("head has explicit i3rab (UN)")
              # We don't continue here because we handle this case below
            elif head.endswith(U):
              pagemsg("Headword template head %s ends with explicit i3rab (U)" % (head))
              add_note("head has explicit i3rab (U)")
              # We don't continue here because we don't need to handle this case

            # Now check if the lemma is plural
            if re.match(r"\bp\b", getp("2")):
              pagemsg("Headword template head %s is plural" % (head))
              add_note("plural lemma")
              if getp("tr"):
                # FIXME (doesn't occur though)
                pagemsg("WARNING: Headword template head %s has manual translit and is plural, skipping" % (head))
                continue
              putp("pl", getp("1"))
              putp("1", "-")

          # Now fetch the parameters from the headword template, removing
          # any that we want to remove, removing the i3rab -UN ending, and
          # adding any specified manual translit as a / annotation.

          def param_should_be_removed(param):
            name = unicode(param.name)
            if name == "sc" and unicode(param.value) == "Arab":
              return True
            if name.endswith("tr"):
              return True
            for remove in removeparams:
              if name == remove:
                return True
              if re.match("^[a-z]+$", remove) and re.match("^%s([0-9]+)?$" % remove, name):
                return True
            return False
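          # E.g. with removeparams containing "pl", both "pl" and numbered
          # variants like "pl2" are dropped, as are all manual-translit
          # params (re-attached below as /-annotations) and redundant
          # sc=Arab.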

          def remove_i3rab(param):
            text = unicode(param)
            if text.endswith(UN):
              pgmsg("Removing i3rab from %s: %s" % (text,
                unicode(headword_template)))
              add_note("removing i3rab")
            return re.sub(UN + "$", "", text)

          def trparam(name):
            if name == "1":
              return "tr"
            elif name.startswith("head"):
              return name.replace("head", "tr")
            else:
              return name + "tr"

          def process_param(param):
            arabic = remove_i3rab(param)
            # Value of + is used in ar-nisba, ar-noun-nisba, ar-adj-in
            # to signal the strong plural.
            if arabic.endswith("=+"):
              newarabic = re.sub(r"=\+$", "=sp", arabic)
              pgmsg("Converting %s to %s: %s" % (arabic,
                newarabic, unicode(headword_template)))
              arabic = newarabic
            # Value of - is used in ar-adj-in to signal an unknown
            # feminine plural.
            if arabic.endswith("=-"):
              newarabic = re.sub(r"=-$", "=?", arabic)
              pgmsg("Converting %s to %s: %s" % (arabic,
                newarabic, unicode(headword_template)))
              arabic = newarabic
            # Don't process translit in modifier constructions, where the
            # translit is also processed.
            if not headspace:
              tr = getparam(headword_template, trparam(unicode(param.name)))
              if tr:
                return arabic + "/" + tr
            return arabic
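          # E.g. a named param "pl=+" is rewritten to "pl=sp", and a head
          # whose matching tr-style param is set comes out as
          # "ARABIC/translit".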

          params = '|'.join([process_param(param) for param in headword_template.params if not param_should_be_removed(param)])
          # For templates that automatically supply the masculine plural,
          # supply it here, too if not overridden.
          if tempname in ["ar-nisba", "ar-noun-nisba", "ar-adj-sound", "ar-adj-an"] and not getp("pl"):
            params += '|pl=sp'

          # Separate off any [[Category: Foo]] declarators, insert before them
          m = re.match(r"^(.*?\n+)((\[\[[A-Za-z0-9_\-]+:[^\]]+\]\]\n*)*)$",
              subsections[j], re.S)
          if m:
            body = m.group(1)
            tail = m.group(2)
          else:
            body = subsections[j]
            tail = ""
          # Make sure there are two trailing newlines
          if body.endswith("\n\n"):
            pass
          elif body.endswith("\n"):
            body += "\n"
          else:
            body += "\n\n"
          body += (subsections[j - 1].replace(pos, "=Declension=") +
              "{{%s|%s}}\n\n" % (decltempname, params))
          subsections[j] = body + tail
          comment = "added declension for %s %s" % (tempname,
            remove_links(orighead) or "%s/%s" % (pagename, getp("tr")))
          note = ', '.join(notes)
          if note:
            comment = "%s (%s)" % (comment, note)
          comments.append(comment)
          sections[seci] = ''.join(subsections) + sectail
  newtext = pagehead + ''.join(sections) + pagetail
  comment = '; '.join(comments)
  assert((not comment) == (newtext == page.text))
  if newtext != page.text:
    if verbose:
      msg("Replacing [[%s]] with [[%s]]" % (page.text, newtext))
    page.text = newtext
    msg("For page %s, comment = %s" % (pagename, comment))
    if save:
      page.save(comment=comment)