コード例 #1
0
def process_param(pagetitle, index, template, param, paramtr,
    include_tempname_in_changelog=False):
  """Clean up the manual transliteration attached to one template param.

  Reads the Arabic text from `param` and its manual Latin transliteration
  from `paramtr` on `template`, then either removes the transliteration
  (when it equals the automatic one) or rewrites it in canonical form.

  Returns False when the Arabic param is empty, True when nothing was
  changed, and otherwise a one-element list holding the changelog entry
  that describes the modification made to `template`.
  `include_tempname_in_changelog` prefixes the param name in that entry
  with the template name.
  """
  def pagemsg(text):
    msg("Page %s %s: %s.%s: %s" % (index, pagetitle, template.name, param,
      text))

  def log_replacement(before):
    # Report the template text before and after an in-place modification.
    msg("Page %s %s: Replaced %s with %s" %
        (index, pagetitle, before, unicode(template)))

  def store_canonical(newlatin):
    # Overwrite the translit param, logging the old and new template text.
    before = "%s" % unicode(template)
    addparam(template, paramtr, newlatin)
    log_replacement(before)

  arabic = getparam(template, param)
  latin = getparam(template, paramtr)
  paramtrname = paramtr
  if include_tempname_in_changelog:
    paramtrname = "%s.%s" % (template.name, paramtr)

  if not arabic:
    return False
  if latin == "-":
    pagemsg("Translit is '-', skipping")
    return True
  if not latin:
    return True

  # Canonicalize the manual translit by matching it against the Arabic.
  try:
    _unused, canonlatin = tr_matching(arabic, latin, True, pagemsg)
    if not canonlatin:
      pagemsg("Unable to match-canonicalize %s (%s)" % (arabic, latin))
  except Exception as e:
    pagemsg("Trying to match-canonicalize %s (%s): %s" % (arabic, latin, e))
    canonlatin = None

  # Compute the automatic translit for comparison.
  try:
    translit = ar_translit.tr(arabic)
    if not translit:
      pagemsg("Unable to auto-translit %s" % arabic)
  except Exception as e:
    pagemsg("Trying to transliterate %s: %s" % (arabic, e))
    translit = None

  if translit and canonlatin:
    # Only an exact match counts as redundant; looser comparisons that
    # tolerate a leading ʾ or a trailing "un" are not applied.
    if translit == canonlatin:
      pagemsg("Removing redundant translit for %s (%s)" % (arabic, latin))
      before = "%s" % unicode(template)
      template.remove(paramtr)
      log_replacement(before)
      return ["remove redundant %s=%s" % (paramtrname, latin)]
    pagemsg("Auto-translit for %s (%s) not same as manual translit %s (canonicalized %s)" %
        (arabic, translit, latin, canonlatin))

  if canonlatin:
    if latin == canonlatin:
      return True
    pagemsg("Match-canonicalizing Latin %s to %s" % (latin, canonlatin))
    store_canonical(canonlatin)
    return ["match-canon %s=%s -> %s" % (paramtrname, latin, canonlatin)]

  # Matching failed; fall back to canonicalizing the Latin on its own.
  canonlatin, _unused = ar_translit.canonicalize_latin_arabic(latin, None)
  if latin == canonlatin:
    return True
  pagemsg("Self-canonicalizing Latin %s to %s" % (latin, canonlatin))
  store_canonical(canonlatin)
  return ["self-canon %s=%s -> %s" % (paramtrname, latin, canonlatin)]
コード例 #2
0
 def __init__(
     self,
     eastarabnum,
     english,
     nom,
     femnom=None,
     obl=None,
     femobl=None,
     hundred=None,
     thousand=None,
     ord=None,
     adv=None,
     frac=None,
     dist=None,
     mult=None,
     numadj=None,
 ):
     """Store the forms of one numeral, deriving those not supplied.

     `femnom`, `obl` and `femobl` default to forms derived from `nom` by
     suffix replacement. Each of the four forms is stored together with
     its automatic transliteration under `<name>tr`.

     `ord` is either a 6-tuple (ordroot, ordeng, ordgloss, cardteeneng,
     ordlemma, cardteen) or a 3-tuple (ordroot, ordeng, cardteeneng) from
     which the remaining values are built. When `ord` is given, the
     ordinal/teen forms are generated from the three root letters and,
     for every numeral except "one" and "two", `frac`, `adv`, `mult` and
     `numadj` are overwritten with generated values and missing
     `hundred`/`thousand` forms are filled in.
     """
     nom = reorder_shadda(nom)
     if not femnom:
         femnom = re.sub(AH + "$", "", nom) if nom.endswith(AH) else nom
     if not obl:
         obl = re.sub(UUN + "$", IIN, nom) if nom.endswith(UUN) else nom
     femnom = reorder_shadda(femnom)
     if not femobl:
         femobl = (
             re.sub(UUN + "$", IIN, femnom) if femnom.endswith(UUN) else femnom
         )

     self.eastarabnum = eastarabnum
     self.english = english
     # Each basic form is stored alongside its transliteration.
     for attrname, form in (
         ("nom", nom),
         ("femnom", femnom),
         ("obl", obl),
         ("femobl", femobl),
     ):
         setattr(self, attrname, form)
         setattr(self, attrname + "tr", ar_translit.tr(form))

     self.hundred = hundred
     self.thousand = thousand
     self.thousandtr = ar_translit.tr(thousand) if thousand else None
     for attrname in (
         "ordroot",
         "ordeng",
         "ordgloss",
         "cardteeneng",
         "ordlemma",
         "cardteen",
     ):
         setattr(self, attrname, None)
     self.adv = adv
     self.frac = frac
     self.dist = dist
     self.mult = mult
     self.numadj = numadj

     # Everything below is derived from the ordinal description.
     if not ord:
         return

     if len(ord) == 6:
         (
             self.ordroot,
             self.ordeng,
             self.ordgloss,
             self.cardteeneng,
             self.ordlemma,
             self.cardteen,
         ) = ord
     else:
         self.ordroot, self.ordeng, self.cardteeneng = ord
         self.ordgloss = self.ordeng
         self.ordlemma = (
             self.ordroot[0] + AA + self.ordroot[1] + I + self.ordroot[2]
         )
         self.cardteen = self.nom + A + u" عَشَرَ"
     self.ordlemmatr = ar_translit.tr(self.ordlemma)

     root1, root2, root3 = self.ordroot[0], self.ordroot[1], self.ordroot[2]
     ordstem = root1 + AA + root2 + I + root3
     self.femordlemma = ordstem + AH
     self.femordlemmatr = ar_translit.tr(self.femordlemma)
     self.ordteen = ordstem + A + u" عَشَرَ"
     self.femordteen = self.femordlemma + A + u" عَشْرَةَ"

     if self.cardteeneng == "twelve":
         self.ordteeneng = "twelfth"
     elif self.cardteeneng == "twenty":
         self.ordteeneng = "twentieth"
     else:
         self.ordteeneng = self.cardteeneng + "th"

     # one and two are totally special-cased
     if self.english not in ("one", "two"):
         self.frac = root1 + U + root2 + SK + root3
         self.adv = self.femnom + u" مَرَّات"
         self.mult = u"مُ" + root1 + A + root2 + SH + A + root3
         self.numadj = root1 + U + root2 + AA + root3 + IY + SH
         if not self.hundred:
             self.hundred = self.femnom + U + u"مِائَة"
         if not self.thousand:
             self.thousand = self.nom + " " + self.thousandpl
             self.thousandtr = self.nomtr + "t " + self.thousandpltr
     # NOTE(review): for "one"/"two" self.hundred is not filled in above,
     # so this presumably relies on the caller passing `hundred` for them.
     self.hundredtr = ar_translit.tr(self.hundred.replace(u"مِا", u"مِ"))
コード例 #3
0
def create_declension(page, index, save, pos, tempname, decltempname, sgnum,
    removeparams, is_proper=False):
  pagename = page.title()
  comments = []

  def pgmsg(text):
    msg("Page %s %s: %s" % (index, pagename, text))

  # Starts with definite article al-
  def starts_with_al(text):
    return re.match(ALIF_ANY + A + "?" + L, text)

  def sub_if(fr, to, text):
    if re.search(fr, text):
      return re.sub(fr, to, text)
    else:
      return ""

  # Remove definite article al- from text
  def remove_al(text):
    return (sub_if("^" + ALIF_ANY + A + "?" + L + SK + "?(.)" + SH, r"\1", text)
        or sub_if("^" + ALIF_ANY + A + "?" + L + SK + "?", "", text)
        or text)

  # Remove definite article al- from transliterated text
  def remove_al_tr(text):
    return (sub_if(ur"^a?([sšṣtṯṭdḏḍzžẓnrḷ])-\1", r"\1", text) or
        sub_if("^a?l-", "", text) or
        text)

  # Split off interwiki links at end
  m = re.match(r"^(.*?\n+)((\[\[[a-z0-9_\-]+:[^\]]+\]\]\n*)*)$",
      page.text, re.S)
  if m:
    pagebody = m.group(1)
    pagetail = m.group(2)
  else:
    pagebody = page.text
    pagetail = ""

  # Split top-level sections (by language)
  splitsections = re.split("(^==[^=\n]+==\n)", pagebody, 0, re.M)

  # Extract off head and recombine section headers with following text
  pagehead = splitsections[0]
  sections = []
  for i in xrange(1, len(splitsections)):
    if (i % 2) == 1:
      sections.append("")
    sections[-1] += splitsections[i]

  # Look for Arabic section
  for seci in xrange(len(sections)):
    m = re.match("^==([^=\n]+)==$", sections[seci], re.M)
    if not m:
      pgmsg("Can't find language name in text: [[%s]]" % (sections[seci]))
    elif m.group(1) == "Arabic":
      # Extract off trailing separator
      mm = re.match(r"^(.*?\n+)(--+\n*)$", sections[seci], re.S)
      if mm:
        secbody = mm.group(1)
        sectail = mm.group(2)
      else:
        secbody = sections[seci]
        sectail = ""

      # Split into subsections based on headers
      subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)

      # Go through each subsection
      for j in xrange(len(subsections)):
        notes = []

        def add_note(note):
          if note not in notes:
            notes.append(note)

        # Look for subsections matching the given POS
        if j > 0 and (j % 2) == 0 and re.match("^===+%s===+\n" % pos, subsections[j - 1]):
          # Call reorder_shadda here so the templates we work with have
          # shadda in correct order but we don't mess with other text to
          # avoid unnecessary saving
          parsed = blib.parse_text(reorder_shadda(subsections[j]))

          def pagemsg(text):
            pgmsg("%s: [[%s]]" % (text, subsections[j]))

          # Check for various conditions causing us to skip this entry and
          # not try to add a declension table

          # Skip declension if certain templates found in definition.
          # We don't check for {{alternative form of|...}}, because it's
          # used for e.g. different ways of spelling "camera" in Arabic,
          # some with -ā and some with -a, so we still want to create
          # declensions for those.
          altspelling_templates = [temp for temp in parsed.filter_templates() if temp.name in
              ["alternative spelling of"]]
          if len(altspelling_templates) > 0:
            pagemsg("Alternative spelling redirect found in text, skipping")
            continue
          if pos == "Adjective":
            feminine_of_templates = [temp for temp in parsed.filter_templates() if temp.name in
                ["feminine of"]]
            if len(feminine_of_templates) > 0:
              pagemsg("feminine-of template found for adjective, skipping")
              continue

          # Retrieve headword_template, make sure exactly one and it is the right type
          headword_templates = [temp for temp in parsed.filter_templates() if temp.name in
              ["ar-noun", "ar-proper noun", "ar-coll-noun", "ar-sing-noun",
                "ar-noun-pl", "ar-noun-dual", "ar-adj-fem", "ar-adj-pl",
                "ar-noun-inf-cons", "ar-adj-inf-def",
                "ar-adj-dual", "ar-adj", "ar-nisba", "ar-noun-nisba",
                "ar-adj-sound", "ar-adj-in", "ar-adj-an"]]
          if len(headword_templates) == 0:
            pagemsg("WARNING: Can't find headword template in text, skipping")
            continue
          if len(headword_templates) > 1:
            pagemsg("WARNING: Found multiple headword templates in text, skipping")
            continue
          headword_template = headword_templates[0]
          if headword_template.name != tempname:
            pagemsg("Headword template should be '%s' but is '%s', skipping" % (tempname, headword_template.name))
            continue
          def getp(param):
            return getparam(headword_template, param)
          # NOTE: We physically add and remove parameters from the headword
          # template to get the list of parameters to use in creating the
          # declension template. These changes don't get propagated to the
          # headword template because we don't convert the parsed text back
          # to a string.
          def putp(param, value):
            addparam(headword_template, param, value)
          head = getp("1")
          orighead = head

          # Check for declension already present
          if (j + 1 < len(subsections) and
              re.match("^===+Declension===+\n", subsections[j + 1])
              or j + 3 < len(subsections) and
              re.match("^===+Usage", subsections[j + 1]) and
              re.match("^===+Declension===+\n", subsections[j + 3])
              ):
            pagemsg("Declension already found for head %s, skipping" % head)
            continue

          # Check for cpl
          # FIXME: Convert cpl into pl and fpl
          if getp("cpl"):
            pagemsg("WARNING: Headword template for head %s has cpl param in it, skipping" % (head))
            continue

          # Check for empty head. If w/o explicit translit, skip; else,
          # fetch head from page title.
          if not head:
            if not getp("tr"):
              pagemsg("WARNING: Headword template head is empty and without explicit translit, skipping")
              continue
            else:
              pagemsg("Headword template head is empty but has explicit translit")
              add_note("empty head, using page name")
            head = pagename
            putp("1", head)

          # Try to handle cases with a modifier; we can't handle all of them yet
          headspace = False
          if ' ' in head:
            headspace = True
            words = re.split(r"\s", remove_links(head))
            head = words[0]
            if len(words) > 2:
              pagemsg("WARNING: Headword template head %s has two or more spaces in it, skipping" % orighead)
              continue
            assert(len(words) == 2)

            # Check for params we don't yet know how to handle
            must_continue = False
            for badparam in ["pl2", "pltr", "head2", "sing", "coll"]:
              if getp(badparam):
                # FIXME
                pagemsg("WARNING: Headword template head %s has space in it and param %s, skipping" % (orighead, badparam))
                must_continue = True
                break
            if must_continue:
              continue

            # Now check for various types of construction, all either
            # construct (ʾidāfa) or adjectival

            def remove_nom_gen_i3rab(word, nomgen, undia, undiatext, udia, udiatext):
              if word.endswith(undia):
                pagemsg("Removing %s i3rab (%s) from %s" % (nomgen, undiatext, word))
                add_note("removing %s i3rab (%s)" % (nomgen, undiatext))
                return re.sub(undia + "$", "", word)
              if word.endswith(udia):
                pagemsg("Removing %s i3rab (%s) from %s" % (nomgen, udiatext, word))
                add_note("removing %s i3rab (%s)" % (nomgen, udiatext))
                return re.sub(udia + "$", "", word)
              if re.search(DIACRITIC_ANY_BUT_SH + "$", word):
                pagemsg("WARNING: Strange diacritic at end of %s %s" % (nomgen, word))
              if word[0] == ALIF_WASLA:
                pagemsg("Changing %s alif wasla to plain alif for %s" % (nomgen, word))
                add_note("changing %s alif wasla to plain alif" % (nomgen))
                word = ALIF + word[1:]
              return word

            def remove_gen_i3rab(word):
              return remove_nom_gen_i3rab(word, "genitive", IN, "IN", I, "I")

            def remove_nom_i3rab(word):
              return remove_nom_gen_i3rab(word, "nominative", UN, "UN", U, "U")

            def remove_gen_i3rab_tr(word):
              return remove_nom_gen_i3rab(word, "genitive", "in", "in", "i", "i")

            def remove_nom_i3rab_tr(word):
              return remove_nom_gen_i3rab(word, "nominative", "un", "un", "u", "u")

            idafa = False
            word0al = starts_with_al(words[0])
            word1al = starts_with_al(words[1])
            words[0] = remove_al(words[0])
            words[1] = remove_al(words[1])
            putp("1", words[0])
            putp("mod", words[1])
            if word0al and word1al:
              pagemsg("Headword template head %s has space in it and found definite adjective construction" % (orighead))
              add_note("modifier definite adjective construction")
              putp("state", "def")
            elif word0al and not word1al:
              pagemsg("WARNING: Headword template head %s has space in it and found al-X + Y construction, can't handle, skipping" % (orighead))
              continue
            elif is_proper:
              if words[0].endswith(ALIF) and word1al:
                pagemsg("Proper noun headword template head %s has space in it and found ind-def with definite adjectival modifier" % (orighead))
                add_note("modifier proper noun + definite adjective construction")
                putp("state", "ind-def")
              elif remove_diacritics(words[0]) == u"جمهورية":
                if word1al:
                  pagemsg("Proper noun headword template head %s has space in it and found definite idafa" % (orighead))
                  add_note("modifier definite idafa construction")
                  idafa = True
                  assert sgnum == "sg"
                  idafaval = "def"
                  putp("idafa", idafaval)
                elif words[1].endswith(ALIF):
                  pagemsg("Proper noun headword template head %s has space in it and found idafa with ind-def modifier" % (orighead))
                  add_note("modifier proper-noun ind-def idafa construction")
                  assert sgnum == "sg"
                  idafaval = "ind-def"
                  putp("idafa", idafaval)
                else:
                  pagemsg("WARNING: Proper noun headword template head %s has space in it and found idafa construction we can't handle, skipping" % (orighead))
                  continue
              else:
                  pagemsg("WARNING: Proper noun headword template head %s has space in it and can't determine whether idafa, skipping" % (orighead))
                  continue

            elif not word0al and word1al:
              # Found an ʾidāfa construction
              pagemsg("Headword template head %s has space in it and found definite idafa" % (orighead))
              add_note("modifier definite idafa construction")
              idafa = True
              idafaval = "def-" + sgnum
              if idafaval == "def-sg":
                idafaval = "def"
              putp("idafa", idafaval)
            elif words[1].endswith(I + Y):
              pagemsg("WARNING: Headword template head %s has space in it and appears to end in badly formatted nisba, FIXME, skipping" % (orighead))
              continue
            elif words[1].endswith(I + Y + SH):
              pagemsg("Headword template head %s has space in it and found indefinite adjective nisba construction" % (orighead))
              add_note("modifier indefinite nisba adjective construction")
            elif pagename in adjectival_phrases:
              pagemsg("Headword template head %s has space in it, indefinite, and manually specified to be adjectival" % (orighead))
              add_note("modifier indefinite adjective construction")
            else:
              pagemsg("Headword template head %s has space in it, indefinite, and not specified to be adjectival, assuming idafa" % (orighead))
              add_note("modifier indefinite idafa construction")
              idafa = True
              putp("idafa", sgnum)

            # Now remove any i3rab diacritics
            putp("1", remove_nom_i3rab(getp("1")))
            if idafa:
              putp("mod", remove_gen_i3rab(getp("mod")))
            else:
              putp("mod", remove_nom_i3rab(getp("mod")))

            # Now check if the lemma is plural
            if re.match(r"\bp\b", getp("2")):
              pagemsg("Headword template head %s has space in it and is plural" % (orighead))
              add_note("plural lemma")
              if getp("tr"):
                # FIXME (doesn't occur though)
                pagemsg("WARNING: Headword template head %s has space in it and manual translit and is plural, skipping" % (orighead))
                continue
              putp("pl", getp("1"))
              putp("1", "-")
              if not idafa:
                putp("modpl", getp("mod"))
                putp("mod", "-")

            # Now check if lemma has plural specified
            elif getp("pl"):
              pls = re.split(r"\s", remove_links(getp("pl")))
              assert(len(pls) == 2)
              pls[0] = remove_al(pls[0])
              pls[1] = remove_al(pls[1])
              putp("pl", remove_nom_i3rab(pls[0]))
              if not idafa:
                putp("modpl", remove_nom_i3rab(pls[1]))
              else:
                if pls[1] != getp("mod"):
                  pagemsg("FIXME: Headword template head %s, plural modifier %s not same as singular modifier %s in idafa construction" % (orighead, pls[1], getp("mod")))

            # Now check if there's manual translit. We need to split the
            # manual translit in two and pair up manual translit with
            # corresponding Arabic words. But first remove -t indicating
            # construct state, and check to see if manual translit is
            # same as auto translit, in which case it's unnecessary.
            if getp("tr"):
              pagemsg("Headword template head %s has space in it and manual translit" % (orighead))
              trwords = re.split(r"\s", getp("tr"))
              assert(len(trwords) == 2)
              trwords[0] = remove_nom_i3rab_tr(remove_al_tr(trwords[0]))
              if idafa:
                trwords[1] = remove_gen_i3rab_tr(remove_al_tr(trwords[1]))
              else:
                trwords[1] = remove_nom_i3rab_tr(remove_al_tr(trwords[1]))
              # Remove any extraneous -t from translit, either from construct
              # state of from removal of i3rab in a feminine noun/adj.
              for i in [0, 1]:
                if words[i].endswith(TAM) and trwords[i].endswith("t"):
                  trwords[i] = trwords[i][0:-1]
                if words[i].endswith(ALIF + TAM) and not trwords[i].endswith("h"):
                  trwords[i] += "h"
              if ar_translit.tr(words[0]) != trwords[0]:
                pagemsg("Headword template head %s has space in it and manual translit %s which is different from auto-translit of %s" % (orighead, trwords[0], words[0]))
                add_note("modified head w/manual translit")
                putp("1", "%s/%s" % (getp("1"), trwords[0]))
              else:
                pagemsg("Headword template head %s has space in it and manual translit %s which is same as auto-translit of %s" % (orighead, trwords[0], words[0]))
                add_note("modified head w/ignored manual translit")
              if ar_translit.tr(words[1]) != trwords[1]:
                pagemsg("Headword template head %s has space in it and manual translit %s which is different from auto-translit of %s" % (orighead, trwords[1], words[1]))
                add_note("modifier w/manual translit")
                putp("mod", "%s/%s" % (getp("mod"), trwords[1]))
              else:
                pagemsg("Headword template head %s has space in it and manual translit %s which is same as auto-translit of %s" % (orighead, trwords[1], words[1]))
                add_note("modifier w/ignored manual translit")

          else:
            # no space in head, not dealing with a modifier

            # If has link in it, just remove it
            if '[' in head or ']' in head or '|' in head:
              pagemsg("Headword template head %s has link in it" % (head))
              add_note("removed links from head")
              head = remove_links(head)
              putp("1", head)

            # If starts with definite article, remove article from everything,
            # including transliterations, and set state=def
            if starts_with_al(head):
              pagemsg("Headword template head %s starts with definite article" % (head))
              add_note("definite lemma")
              head = remove_al(head)
              putp("1", head)
              putp("state", "def")
              # Also remove al- from remaining head and pl params
              def check_for_al(param):
                param = remove_links(param)
                value = getparam(headword_template, param)
                if value:
                  if '[' in value or ']' in value or '|' in value:
                    pagemsg("Param %s value %s has link in it" % (param, value))
                    add_note("removed links from %s" % param)
                    value = remove_links(value)
                  putp(param, remove_al(value))
              params_to_check = ["pl", "sing", "coll", "pauc", "f", "fpl"]
              for param in params_to_check:
                check_for_al(param)
              for i in xrange(2, 10):
                check_for_al("head%s" % i)
                for param in params_to_check:
                  check_for_al("%s%s" % (param, i))
              # Also remove al- from transliteration
              def check_for_al_tr(param):
                value = getparam(headword_template, param)
                if value:
                  putp(param, remove_al_tr(value))
              check_for_al("tr")
              for param in params_to_check:
                check_for_al("%str" % param)
              for i in xrange(2, 10):
                check_for_al("tr%s" % i)
                for param in params_to_check:
                  check_for_al("%s%str" % (param, i))
            elif is_proper:
              if head.endswith(ALIF):
                pagemsg(u"Headword template head %s ends in -ā" % (head))
                putp("state", "ind-def")
              else:
                pagemsg(u"WARNING: Headword template head %s is indefinite proper noun, not ending in -ā, skipping" % (head))
                continue

            if head.endswith(UN):
              pagemsg("Headword template head %s ends with explicit i3rab (UN)" % (head))
              add_note("head has explicit i3rab (UN)")
              # We don't continue here because we handle this case below
            elif head.endswith(U):
              pagemsg("Headword template head %s ends with explicit i3rab (U)" % (head))
              add_note("head has explicit i3rab (U)")
              # We don't continue here because we don't need to handle this case

            # Now check if the lemma is plural
            if re.match(r"\bp\b", getp("2")):
              pagemsg("Headword template head %s is plural" % (head))
              add_note("plural lemma")
              if getp("tr"):
                # FIXME (doesn't occur though)
                pagemsg("WARNING: Headword template head %s has manual translit and is plural, skipping" % (head))
                continue
              putp("pl", getp("1"))
              putp("1", "-")

          # Now fetch the parameters from the headword template, removing
          # any that we want to remove, removing the i3rab -UN ending, and
          # adding any specified manual translit as a / annotation.

          def param_should_be_removed(param):
            name = unicode(param.name)
            if name == "sc" and unicode(param.value) == "Arab":
              return True
            if name.endswith("tr"):
              return True
            for remove in removeparams:
              if name == remove:
                return True
              if re.match("^[a-z]+$", remove) and re.match("^%s([0-9]+)?$" % remove, name):
                return True
            return False

          def remove_i3rab(param):
            text = unicode(param)
            if text.endswith(UN):
              pgmsg("Removing i3rab from %s: %s" % (text,
                unicode(headword_template)))
              add_note("removing i3rab")
            return re.sub(UN + "$", "", text)

          def trparam(name):
            if name == "1":
              return "tr"
            elif name.startswith("head"):
              return name.replace("head", "tr")
            else:
              return name + "tr"

          def process_param(param):
            arabic = remove_i3rab(param)
            # Value of + is used in ar-nisba, ar-noun-nisba, ar-adj-in
            # to signal the strong plural.
            if arabic.endswith("=+"):
              newarabic = re.sub(r"=\+$", "=sp", arabic)
              pgmsg("Converting %s to %s: %s" % (arabic,
                newarabic, unicode(headword_template)))
              arabic = newarabic
            # Value of - is used in ar-adj-in to signal an unknown
            # feminine plural.
            if arabic.endswith("=-"):
              newarabic = re.sub(r"=-$", "=?", arabic)
              pgmsg("Converting %s to %s: %s" % (arabic,
                newarabic, unicode(headword_template)))
              arabic = newarabic
            # Don't process translit in modifier constructions, where the
            # translit is also processed.
            if not headspace:
              tr = getparam(headword_template, trparam(unicode(param.name)))
              if tr:
                return arabic + "/" + tr
            return arabic

          params = '|'.join([process_param(param) for param in headword_template.params if not param_should_be_removed(param)])
          # For templates that automatically supply the masculine plural,
          # supply it here, too if not overridden.
          if tempname in ["ar-nisba", "ar-noun-nisba", "ar-adj-sound", "ar-adj-an"] and not getp("pl"):
            params += '|pl=sp'

          # Separate off any [[Category: Foo]] declarators, insert before them
          m = re.match(r"^(.*?\n+)((\[\[[A-Za-z0-9_\-]+:[^\]]+\]\]\n*)*)$",
              subsections[j], re.S)
          if m:
            body = m.group(1)
            tail = m.group(2)
          else:
            body = subsections[j]
            tail = ""
          # Make sure there are two trailing newlines
          if body.endswith("\n\n"):
            pass
          elif body.endswith("\n"):
            body += "\n"
          else:
            body += "\n\n"
          body += (subsections[j - 1].replace(pos, "=Declension=") +
              "{{%s|%s}}\n\n" % (decltempname, params))
          subsections[j] = body + tail
          comment = "added declension for %s %s" % (tempname,
            remove_links(orighead) or "%s/%s" % (pagename, getp("tr")))
          note = ', '.join(notes)
          if note:
            comment = "%s (%s)" % (comment, note)
          comments.append(comment)
          sections[seci] = ''.join(subsections) + sectail
  newtext = pagehead + ''.join(sections) + pagetail
  comment = '; '.join(comments)
  assert((not comment) == (newtext == page.text))
  if newtext != page.text:
    if verbose:
      msg("Replacing [[%s]] with [[%s]]" % (page.text, newtext))
    page.text = newtext
    msg("For page %s, comment = %s" % (pagename, comment))
    if save:
      page.save(comment = comment)
コード例 #4
0
def do_canon_param(pagetitle, index, template, fromparam, toparam, paramtr,
    arabic, latin, include_tempname_in_changelog=False):
  """Work out canonical Arabic/Latin for one param without editing it.

  `arabic` is the value of `fromparam` and `latin` the manual translit
  held in `paramtr`. The template itself is only read for log messages.

  Returns a tuple (canonarabic, canonlatin, actions):
    canonarabic: the new Arabic text, or False if it is unchanged.
    canonlatin: True if the translit is redundant and should be removed,
      False if it is unchanged, otherwise the new translit (the empty
      string when no translit was given).
    actions: list of changelog strings describing the changes.
  A translit of "-" yields (False, False, []).
  """
  actions = []
  tname = unicode(template.name)

  def pagemsg(text):
    msg("Page %s %s: %s.%s: %s" % (index, pagetitle, tname, fromparam, text))

  if show_template:
    pagemsg("Processing %s" % (unicode(template)))

  paramtrname = paramtr
  if include_tempname_in_changelog:
    paramtrname = "%s.%s" % (tname, paramtr)

  if latin == "-":
    pagemsg("Latin is -, taking no action")
    return False, False, []

  # Compute canonarabic and canonlatin
  match_canon = False
  canonlatin = ""
  if not latin:
    _, canonarabic = ar_translit.canonicalize_latin_arabic(None, arabic,
        msgfun=pagemsg)
  else:
    try:
      canonarabic, canonlatin = ar_translit.tr_matching(arabic, latin, True,
          msgfun=pagemsg)
      match_canon = True
    except Exception as e:
      pagemsg("NOTE: Unable to vocalize %s (%s): %s: %s" % (arabic, latin, e, unicode(template)))
      canonlatin, canonarabic = ar_translit.canonicalize_latin_arabic(latin,
          arabic, msgfun=pagemsg)

  # Display forms for the log: "same" stands in for an unchanged value.
  newlatin = "same" if canonlatin == latin else canonlatin
  newarabic = "same" if canonarabic == arabic else canonarabic

  if latin or canonlatin:
    latintrtext = " (%s -> %s)" % (latin, newlatin)
  else:
    latintrtext = ""

  try:
    translit = ar_translit.tr(canonarabic, msgfun=pagemsg)
    if not translit:
      pagemsg("NOTE: Unable to auto-translit %s (canoned from %s): %s" %
          (canonarabic, arabic, unicode(template)))
  except Exception as e:
    pagemsg("NOTE: Unable to transliterate %s (canoned from %s): %s: %s" %
        (canonarabic, arabic, e, unicode(template)))
    translit = None

  if canonarabic == arabic:
    pagemsg("No change in Arabic %s%s" % (arabic, latintrtext))
    canonarabic = False
  else:
    # The diff is only shown when the change did not come from matching.
    if match_canon:
      verb, action_tag, with_diff = "Vocalizing", "vocalize", False
    elif latin:
      verb, action_tag, with_diff = "Cross-canoning", "cross-canon", True
    else:
      verb, action_tag, with_diff = "Self-canoning", "self-canon", True
    difftext = " (%s)" % diff_string(arabic, canonarabic) if with_diff else ""
    pagemsg("%s Arabic %s -> %s%s%s: %s" % (verb, arabic, canonarabic,
      latintrtext, difftext, unicode(template)))
    if fromparam == toparam:
      actions.append("%s %s=%s -> %s" % (action_tag, fromparam, arabic,
        canonarabic))
    else:
      actions.append("%s %s=%s -> %s=%s" % (action_tag, fromparam, arabic,
        toparam, canonarabic))

    # A difference that survives diacritic removal means the base letters
    # changed; list the kinds of unusual characters found in the old text.
    rdcanonarabic = ar_translit.remove_diacritics(canonarabic)
    rdarabic = ar_translit.remove_diacritics(arabic)
    if rdarabic != rdcanonarabic:
      findings = [
        ("stray space",
          "  " in rdarabic or rdarabic.startswith(" ") or rdarabic.endswith(" ")),
        ("Latin", re.search("[A-Za-z]", nfd_form(rdarabic))),
        ("NBSP", u"\u00A0" in rdarabic),
        ("L2R/R2L", re.search(u"[\u200E\u200F]", rdarabic)),
        ("Farsi Yeh", u"ی" in rdarabic),
        ("Keheh", u"ک" in rdarabic),
        ("Arabic Pres-A", re.search(u"[\uFB50-\uFDCF]", rdarabic)),
        ("Arabic word ligatures", re.search(u"[\uFDF0-\uFDFF]", rdarabic)),
        ("Arabic Pres-B", re.search(u"[\uFE70-\uFEFF]", rdarabic)),
      ]
      msgs = [label for label, present in findings if present]
      oldtext = " (in old: %s)" % ", ".join(msgs) if msgs else ""
      rddiff = diff_string(rdarabic, rdcanonarabic)

      pagemsg("NOTE: Without diacritics, old Arabic %s different from canon %s%s (%s): %s"
          % (arabic, canonarabic, oldtext, rddiff, unicode(template)))

  if not latin:
    pass
  elif translit and translit == canonlatin:
    # Only an exact match with the auto-translit counts as redundant.
    pagemsg("Removing redundant translit for %s -> %s%s" % (
        arabic, newarabic, latintrtext))
    actions.append("remove redundant %s=%s" % (paramtrname, latin))
    canonlatin = True
  else:
    if match_canon:
      verb, passive, action_tag = (
          "Match-canoning", "Match-canoned", "match-canon")
    else:
      verb, passive, action_tag = (
          "Cross-canoning", "Cross-canoned", "cross-canon")
    if translit:
      pagemsg("NOTE: %s Latin %s not same as auto-translit %s, can't remove: %s" %
          (passive, canonlatin, translit, unicode(template)))
    if canonlatin == latin:
      pagemsg("No change in Latin %s: Arabic %s -> %s (auto-translit %s)" %
          (latin, arabic, newarabic, translit))
      canonlatin = False
    else:
      pagemsg("%s Latin %s -> %s: Arabic %s -> %s (auto-translit %s): %s" % (
          verb, latin, canonlatin, arabic, newarabic, translit,
          unicode(template)))
      actions.append("%s %s=%s -> %s" % (action_tag, paramtrname, latin,
        canonlatin))

  return (canonarabic, canonlatin, actions)