Esempio n. 1
0
 def do_one_page_verb(page, index, text):
   """Run remove_i3rab() over the vn= value of every ar-conj template.

   The vn= value is split on Latin or Arabic commas, each piece is passed
   through remove_i3rab(), and the results are rejoined with ",".  A trailing
   "?" (logged as an uncertain verbal noun) is set aside while processing and
   re-appended afterwards.

   Returns a (text, changelog) tuple; the changelog names the verbs whose
   vn= value was rewritten.
   """
   pagename = page.title()
   verbcount = 0
   verbids = []
   for template in text.filter_templates():
     if template.name == "ar-conj":
       verbcount += 1
       # Keep the untouched value: the change test below must compare like
       # with like (including any trailing "?").
       origvalue = getparam(template, "vn")
       vnvalue = origvalue
       uncertain = vnvalue.endswith("?")
       if uncertain:
         vnvalue = vnvalue[:-1]
         msg("Page %s %s: Verbal noun(s) identified as uncertain" % (
           index, pagename))
       if not vnvalue:
         continue
       form = getparam(template, "1")
       verbid = "#%s form %s" % (verbcount, form)
       # Form I (written "1" or "I", optionally followed by "-...") is
       # additionally identified by params 2 and 3.
       if re.match("^[1I](-|$)", form):
         verbid += " (%s,%s)" % (getparam(template, "2"), getparam(template, "3"))
       newvn = ",".join([remove_i3rab(pagename, index, verbid, vn)
                         for vn in re.split(u"[,،]", vnvalue)])
       if uncertain:
         newvn += "?"
       # Comparing against the stripped value would make every uncertain
       # entry look modified even when nothing was removed.
       if newvn != origvalue:
         msg("Page %s %s: Verb %s, replacing %s with %s" % (
           index, pagename, verbid, origvalue, newvn))
         addparam(template, "vn", newvn)
         verbids.append(verbid)
   return text, "Remove i3rab from verbal nouns for verb(s) %s" % (
         ', '.join(verbids))
Esempio n. 2
0
def canon_param(pagetitle,
                index,
                template,
                param,
                paramtr,
                include_tempname_in_changelog=False):
    """Canonicalize one Arabic parameter and its transliteration in place.

    `param` is either a single parameter name, or a two-element list
    [fromparam, toparam]; the special source name "page title" reads the
    page title instead of a template parameter.  The actual canonicalization
    is delegated to do_canon_param().

    Returns False when the source value is empty, otherwise the `actions`
    value returned by do_canon_param().
    """
    if isinstance(param, list):
        src_name, dest_name = param
    else:
        src_name = dest_name = param
    if src_name == "page title":
        arabic = pagetitle
    else:
        arabic = getparam(template, src_name)
    latin = getparam(template, paramtr)
    if not arabic:
        return False
    new_arabic, new_latin, actions = do_canon_param(
        pagetitle, index, template, src_name, dest_name, paramtr, arabic,
        latin, include_tempname_in_changelog)
    # Snapshot before mutating so the log can show old vs. new.
    before = "%s" % unicode(template)
    if new_arabic:
        addparam(template, dest_name, new_arabic)
    # A result equal to True means "drop the transliteration"; any other
    # truthy result is the replacement value.
    if new_latin == True:
        template.remove(paramtr)
    elif new_latin:
        addparam(template, paramtr, new_latin)
    if new_arabic or new_latin:
        after = unicode(template)
        msg("Page %s %s: Replaced %s with %s" %
            (index, pagetitle, before, after))
    return actions
Esempio n. 3
0
def canon_param(pagetitle, index, template, lang, param, paramtr,
                translit_module):
    """Canonicalize one foreign-script parameter and its transliteration.

    `param` is either a single parameter name, or a two-element list
    [fromparam, toparam]; the special source name "page title" reads the
    page title instead of a template parameter.  The canonical values are
    computed by do_canon_param(); the foreign value is written back through
    add_param_handling_head().

    Returns False when the source value is empty, otherwise the `actions`
    value returned by do_canon_param().
    """
    if isinstance(param, list):
        src_name, dest_name = param
    else:
        src_name = dest_name = param
    if src_name == "page title":
        foreign = pagetitle
    else:
        foreign = getparam(template, src_name)
    latin = getparam(template, paramtr)
    if not foreign:
        return False
    new_foreign, new_latin, actions = do_canon_param(
        pagetitle, index, template, lang, src_name, dest_name, paramtr,
        foreign, latin, translit_module)
    # Snapshot before mutating so the log can show old vs. new.
    before = "%s" % unicode(template)
    if new_foreign:
        add_param_handling_head(template, dest_name, new_foreign)
    # A result equal to True means "drop the transliteration"; any other
    # truthy result is the replacement value.
    if new_latin == True:
        template.remove(paramtr)
    elif new_latin:
        addparam(template, paramtr, new_latin)
    if new_foreign or new_latin:
        after = unicode(template)
        msg("Page %s %s: Replaced %s with %s" %
            (index, pagetitle, before, after))
    return actions
Esempio n. 4
0
    def canonicalize_one_page_verb_form(page, index, text):
        """Canonicalize the verb-form argument of every `tempname` template.

        `tempname` and `formarg` come from the enclosing scope.  One entry is
        appended to the change log per matching template, whether or not its
        text changed.  Returns a (text, changelog) tuple.
        """
        pagetitle = page.title()
        msg("Processing page %s" % pagetitle)
        actions_taken = []

        for template in text.filter_templates():
            if template.name == tempname:
                before = unicode(template)
                form = getparam(template, formarg)
                if form:
                    addparam(template, formarg, canonicalize_form(form))
                after = unicode(template)
                if before != after:
                    msg("Replacing %s with %s" % (before, after))
                # The log entry uses the form value as originally read.
                if re.match("^[1I](-|$)", form):
                    # Form I: also report the two params following formarg.
                    first_extra = getparam(template, str(1 + int(formarg)))
                    second_extra = getparam(template, str(2 + int(formarg)))
                    entry = "form=%s (%s/%s)" % (form, first_extra,
                                                 second_extra)
                else:
                    entry = "form=%s" % form
                actions_taken.append(entry)
        changelog = "%s: canonicalize form (%s=) to Roman numerals: %s" % (
            tempname, formarg, '; '.join(actions_taken))
        if actions_taken:
            msg("Change log = %s" % changelog)
        return text, changelog
def rewrite_one_page_verb_headword(page, index, text):
  """Turn form= of each ar-verb template into a leading positional 1=.

  Existing numbered params 1..10 are shifted up by one to make room, and
  the form value is passed through canonicalize_form().  Returns a
  (text, changelog) tuple.
  """
  pagetitle = page.title()
  msg("Processing page %s" % pagetitle)
  actions_taken = []

  for template in text.filter_templates():
    if template.name in ["ar-verb"]:
      before = unicode(template)
      form = getparam(template, "form")
      if form:
        # Renumber in place (highest first, so names never collide); this
        # keeps the params in their existing order.
        for pno in reversed(xrange(1, 11)):
          key = str(pno)
          if template.has(key):
            template.get(key).name = str(pno + 1)
        # Re-insert form= at the very front ...
        template.remove("form")
        canonical = canonicalize_form(form)
        anchor = template.params[0].name if template.params else None
        addparam(template, "form", canonical, before=anchor)
        # ... and then rename it to an unkeyed 1=.
        template.get("form").name = "1"
        template.get("1").showkey = False
      after = unicode(template)
      if before != after:
        msg("Replacing %s with %s" % (before, after))
      if re.match("^[1I](-|$)", form):
        entry = "form=%s (%s/%s)" % (form,
          getparam(template, "2"), getparam(template, "3"))
      else:
        entry = "form=%s" % form
      actions_taken.append(entry)
  changelog = "ar-verb: form= -> 1= and canonicalize to Roman numerals, move other params up: %s" % '; '.join(actions_taken)
  if actions_taken:
    msg("Change log = %s" % changelog)
  return text, changelog
Esempio n. 6
0
def canon_param(pagetitle, index, template, param, paramtr, translit_module,
    include_tempname_in_changelog=False):
  """Canonicalize one foreign-script parameter and its transliteration.

  `param` is either a single parameter name, or a two-element list
  [fromparam, toparam]; the special source name "page title" reads the page
  title instead of a template parameter.  The canonical values are computed
  by do_canon_param().

  Returns False when the source value is empty, otherwise the `actions`
  value returned by do_canon_param().
  """
  if isinstance(param, list):
    src_name, dest_name = param
  else:
    src_name = dest_name = param
  if src_name == "page title":
    foreign = pagetitle
  else:
    foreign = getparam(template, src_name)
  latin = getparam(template, paramtr)
  if not foreign:
    return False
  new_foreign, new_latin, actions = do_canon_param(pagetitle, index,
      template, src_name, dest_name, paramtr, foreign, latin, translit_module,
      include_tempname_in_changelog)
  # Snapshot before mutating so the log can show old vs. new.
  before = "%s" % unicode(template)
  if new_foreign:
    addparam(template, dest_name, new_foreign)
  # A result equal to True means "drop the transliteration"; any other
  # truthy result is the replacement value.
  if new_latin == True:
    template.remove(paramtr)
  elif new_latin:
    addparam(template, paramtr, new_latin)
  if new_foreign or new_latin:
    after = unicode(template)
    msg("Page %s %s: Replaced %s with %s" % (index, pagetitle,
      before, after))
  return actions
Esempio n. 7
0
 def do_one_page_verb(page, index, text):
   """Run remove_i3rab() over the vn= value of every ar-conj template.

   The vn= value is split on Latin or Arabic commas, each piece is passed
   through remove_i3rab(), and the results are rejoined with ",".  A trailing
   "?" (logged as an uncertain verbal noun) is set aside while processing and
   re-appended afterwards.

   Returns a (text, changelog) tuple; the changelog names the verbs whose
   vn= value was rewritten.
   """
   pagename = page.title()
   verbcount = 0
   verbids = []
   for template in text.filter_templates():
     if template.name == "ar-conj":
       verbcount += 1
       # Keep the untouched value: the change test below must compare like
       # with like (including any trailing "?").
       origvalue = getparam(template, "vn")
       vnvalue = origvalue
       uncertain = vnvalue.endswith("?")
       if uncertain:
         vnvalue = vnvalue[:-1]
         msg("Page %s %s: Verbal noun(s) identified as uncertain" % (
           index, pagename))
       if not vnvalue:
         continue
       form = getparam(template, "1")
       verbid = "#%s form %s" % (verbcount, form)
       # Form I (written "1" or "I", optionally followed by "-...") is
       # additionally identified by params 2 and 3.
       if re.match("^[1I](-|$)", form):
         verbid += " (%s,%s)" % (getparam(template, "2"), getparam(template, "3"))
       newvn = ",".join([remove_i3rab(pagename, index, verbid, vn)
                         for vn in re.split(u"[,،]", vnvalue)])
       if uncertain:
         newvn += "?"
       # Comparing against the stripped value would make every uncertain
       # entry look modified even when nothing was removed.
       if newvn != origvalue:
         msg("Page %s %s: Verb %s, replacing %s with %s" % (
           index, pagename, verbid, origvalue, newvn))
         addparam(template, "vn", newvn)
         verbids.append(verbid)
   return text, "Remove i3rab from verbal nouns for verb(s) %s" % (
         ', '.join(verbids))
  def canonicalize_one_page_verb_form(page, index, text):
    """Canonicalize the verb-form argument of every `tempname` template.

    `tempname` and `formarg` come from the enclosing scope.  One entry is
    appended to the change log per matching template, whether or not its
    text changed.  Returns a (text, changelog) tuple.
    """
    pagetitle = page.title()
    msg("Processing page %s" % pagetitle)
    actions_taken = []

    for template in text.filter_templates():
      if template.name == tempname:
        before = unicode(template)
        form = getparam(template, formarg)
        if form:
          addparam(template, formarg, canonicalize_form(form))
        after = unicode(template)
        if before != after:
          msg("Replacing %s with %s" % (before, after))
        # The log entry uses the form value as originally read.
        if re.match("^[1I](-|$)", form):
          # Form I: also report the two params following formarg.
          first_extra = getparam(template, str(1+int(formarg)))
          second_extra = getparam(template, str(2+int(formarg)))
          entry = "form=%s (%s/%s)" % (form, first_extra, second_extra)
        else:
          entry = "form=%s" % form
        actions_taken.append(entry)
    changelog = "%s: canonicalize form (%s=) to Roman numerals: %s" % (
        tempname, formarg, '; '.join(actions_taken))
    if actions_taken:
      msg("Change log = %s" % changelog)
    return text, changelog
def process_template(pagetitle, index, template, ruparam, trparam, output_line,
    find_accents, verbose):
  """Try to replace a template's Cyrillic value with an accented version.

  `ruparam` is a parameter name, the string "page title", or a two-element
  list [read_from, save_to].  `trparam` names the transliteration parameter
  (falsy if there is none).  Status strings are reported via output_line().

  Returns a one-element list describing the change if the template was
  modified, otherwise False.  {{head}} templates are never touched.
  """
  origt = unicode(template)
  def pagemsg(text):
    msg("Page %s %s: %s" % (index, pagetitle, text))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, semi_verbose)
  if semi_verbose:
    pagemsg("Processing template: %s" % unicode(template))
  if unicode(template.name) == "head":
    # Skip {{head}}. We don't want to mess with headwords.
    return False
  if isinstance(ruparam, list):
    ruparam, saveparam = ruparam
  else:
    saveparam = ruparam
  val = pagetitle if ruparam == "page title" else getparam(template, ruparam)
  valtr = ""
  if trparam:
    valtr = getparam(template, trparam)
  changed = False
  if find_accents:
    newval, newtr = find_accented(val, valtr, verbose, pagemsg, expand_text,
        origt)
    differs = newval != val or newtr != valtr
    if differs and ru.remove_accents(newval) != ru.remove_accents(val):
      # The lookup result is not merely an accented form of the input.
      pagemsg("WARNING: Accented page %s changed from %s in more than just accents, not changing" % (newval, val))
    elif differs:
      changed = True
      addparam(template, saveparam, newval)
      if not newtr:
        if valtr:
          pagemsg("WARNING: Template has translit %s but lookup result has none, leaving translit alone: origt=%s" %
              (valtr, origt))
      elif not trparam:
        pagemsg("WARNING: Unable to change translit to %s because no translit param available (Cyrillic param %s): %s" %
            (newtr, saveparam, origt))
      elif unicode(template.name) in ["ru-ux"]:
        pagemsg("WARNING: Not changing or adding translit param %s=%s to ru-ux: origt=%s" % (
          trparam, newtr, origt))
      else:
        if not valtr:
          pagemsg("NOTE: Added translit param %s=%s to template: origt=%s" %
              (trparam, newtr, origt))
        elif valtr != newtr:
          pagemsg("WARNING: Changed translit param %s from %s to %s: origt=%s" %
              (trparam, valtr, newtr, origt))
        addparam(template, trparam, newtr)
      if check_need_accent(newval):
        output_line("Need accents (changed)")
      else:
        output_line("Found accents")
  if changed:
    pagemsg("Replaced %s with %s" % (origt, unicode(template)))
    return ["auto-accent %s%s" % (newval, "//%s" % newtr if newtr else "")]
  if check_need_accent(val):
    output_line("Need accents")
  return False
Esempio n. 10
0
def rewrite_one_page_ar_nisba(page, index, text):
  """Move head= to a leading 1= in every ar-nisba template on the page.

  Templates that already have a 1= are left alone.  Any ar-nisba template
  carrying plhead= is reported via blib.msg().  Returns (text, changelog).
  """
  for template in text.filter_templates():
    if template.name == "ar-nisba":
      needs_move = template.has("head") and not template.has(1)
      if needs_move:
        head = unicode(template.get("head").value)
        template.remove("head")
        # Insert in front of whatever is now the first param, if any.
        anchor = template.params[0].name if template.params else None
        addparam(template, "1", head, before=anchor)
      if template.has("plhead"):
        blib.msg("%s has plhead=" % page.title())
  return text, "ar-nisba: head= -> 1="
Esempio n. 11
0
def add_param_handling_head(template, param, value):
    """Set `param` to `value`, placing head= ahead of other named params.

    For any parameter other than "head" this is a plain addparam().  For
    "head", the parameter is inserted before the first existing parameter
    whose name does not start with a digit (or with before=None if there is
    no such parameter).
    """
    if param == "head":
        anchor = None
        for existing in template.params:
            candidate = unicode(existing.name).strip()
            if not re.match("^[0-9]+", candidate):
                anchor = candidate
                break
        addparam(template, param, value, before=anchor)
    else:
        addparam(template, param, value)
Esempio n. 12
0
def fix(page, index, text):
  """Make head= the first, unkeyed parameter of Arabic headword templates.

  Only templates named in arabiclib.arabic_all_headword_templates that have
  head= and none of the numbered params 1..8 are touched.  Returns
  (text, changelog).
  """
  for template in text.filter_templates():
    if template.name not in arabiclib.arabic_all_headword_templates:
      continue
    if not template.has("head"):
      continue
    if any(template.has(num) for num in range(1, 9)):
      continue
    head = unicode(template.get("head").value)
    template.remove("head")
    anchor = template.params[0].name if template.params else None
    addparam(template, "head", head, before=anchor)

    # Once head= is in first position, hide its key.
    if template.params[0].name == "head":
      template.get("head").showkey = False

  return text, "ar headword: head= > 1="
Esempio n. 13
0
def rewrite_one_page_ar_nisba(page, index, text):
    """Move head= to a leading 1= in every ar-nisba template on the page.

    Templates that already have a 1= are left alone.  Any ar-nisba template
    carrying plhead= is reported via blib.msg().  Returns (text, changelog).
    """
    for template in text.filter_templates():
        if template.name == "ar-nisba":
            needs_move = template.has("head") and not template.has(1)
            if needs_move:
                head = unicode(template.get("head").value)
                template.remove("head")
                # Insert in front of whatever is now the first param, if any.
                if template.params:
                    anchor = template.params[0].name
                else:
                    anchor = None
                addparam(template, "1", head, before=anchor)
            if template.has("plhead"):
                blib.msg("%s has plhead=" % page.title())
    return text, "ar-nisba: head= -> 1="
Esempio n. 14
0
def vocalize_param(pagetitle, index, template, param, paramtr):
  """Replace an Arabic parameter with the result of do_vocalize_param().

  Returns False if the Arabic parameter is empty; True if there is no
  transliteration or do_vocalize_param() yields nothing; otherwise the new
  value, after writing it back to `param` and logging the replacement.
  """
  arabic = getparam(template, param)
  latin = getparam(template, paramtr)
  if not arabic:
    return False
  if not latin:
    return True
  vocalized = do_vocalize_param(pagetitle, index, template, param, arabic, latin)
  if not vocalized:
    return True
  before = "%s" % unicode(template)
  addparam(template, param, vocalized)
  after = unicode(template)
  msg("Page %s %s: Replaced %s with %s" % (index, pagetitle,
    before, after))
  return vocalized
Esempio n. 15
0
def vocalize_param(pagetitle, index, template, param, paramtr):
    """Replace an Arabic parameter with the result of do_vocalize_param().

    Returns False if the Arabic parameter is empty; True if there is no
    transliteration or do_vocalize_param() yields nothing; otherwise the new
    value, after writing it back to `param` and logging the replacement.
    """
    arabic = getparam(template, param)
    latin = getparam(template, paramtr)
    if not arabic:
        return False
    if not latin:
        return True
    vocalized = do_vocalize_param(pagetitle, index, template, param,
                                  arabic, latin)
    if not vocalized:
        return True
    before = "%s" % unicode(template)
    addparam(template, param, vocalized)
    after = unicode(template)
    msg("Page %s %s: Replaced %s with %s" %
        (index, pagetitle, before, after))
    return vocalized
 def fix_one_page_tool_place_noun(page, index, text):
   """Swap cap= for lc=1 on every template named `template`.

   `template` is taken from the enclosing scope.  A template with a non-empty
   cap= has it removed; every other matching template gets lc=1.  Returns
   (text, changelog).
   """
   pagetitle = page.title()
   for t in text.filter_templates():
     if t.name == template:
       has_cap = getparam(t, "cap")
       if has_cap:
         note = ("Page %s %s: Template %s: Remove cap=" %
             (index, pagetitle, template))
         msg(note)
         t.remove("cap")
       else:
         note = ("Page %s %s: Template %s: Add lc=1" %
             (index, pagetitle, template))
         msg(note)
         addparam(t, "lc", "1")
   changelog = "%s: If cap= is present, remove it, else add lc=" % template
   msg("Page %s %s: Change log = %s" % (index, pagetitle, changelog))
   return text, changelog
Esempio n. 17
0
 def fix_one_page_tool_place_noun(page, index, text):
     """Swap cap= for lc=1 on every template named `template`.

     `template` is taken from the enclosing scope.  A template with a
     non-empty cap= has it removed; every other matching template gets lc=1.
     Returns (text, changelog).
     """
     pagetitle = page.title()
     for t in text.filter_templates():
         if t.name == template:
             has_cap = getparam(t, "cap")
             if has_cap:
                 note = ("Page %s %s: Template %s: Remove cap=" %
                         (index, pagetitle, template))
                 msg(note)
                 t.remove("cap")
             else:
                 note = ("Page %s %s: Template %s: Add lc=1" %
                         (index, pagetitle, template))
                 msg(note)
                 addparam(t, "lc", "1")
     changelog = "%s: If cap= is present, remove it, else add lc=" % template
     msg("Page %s %s: Change log = %s" % (index, pagetitle, changelog))
     return text, changelog
Esempio n. 18
0
def fix(page, index, text):
    """Make head= the first, unkeyed parameter of Arabic headword templates.

    Only templates named in arabiclib.arabic_all_headword_templates that have
    head= and none of the numbered params 1..8 are touched.  Returns
    (text, changelog).
    """
    for template in text.filter_templates():
        if template.name not in arabiclib.arabic_all_headword_templates:
            continue
        if not template.has("head"):
            continue
        if any(template.has(num) for num in range(1, 9)):
            continue
        head = unicode(template.get("head").value)
        template.remove("head")
        if template.params:
            anchor = template.params[0].name
        else:
            anchor = None
        addparam(template, "head", head, before=anchor)

        # Once head= is in first position, hide its key.
        if template.params[0].name == "head":
            template.get("head").showkey = False

    return text, "ar headword: head= > 1="
Esempio n. 19
0
def rewrite_one_page_verb_headword(page, index, text):
    """Turn form= of each ar-verb template into a leading positional 1=.

    Existing numbered params 1..10 are shifted up by one to make room, and
    the form value is passed through canonicalize_form().  Returns a
    (text, changelog) tuple.
    """
    pagetitle = page.title()
    msg("Processing page %s" % pagetitle)
    actions_taken = []

    for template in text.filter_templates():
        if template.name in ["ar-verb"]:
            before = unicode(template)
            form = getparam(template, "form")
            if form:
                # Renumber in place (highest first, so names never collide);
                # this keeps the params in their existing order.
                for pno in reversed(xrange(1, 11)):
                    key = str(pno)
                    if template.has(key):
                        template.get(key).name = str(pno + 1)
                # Re-insert form= at the very front ...
                template.remove("form")
                canonical = canonicalize_form(form)
                if template.params:
                    anchor = template.params[0].name
                else:
                    anchor = None
                addparam(template, "form", canonical, before=anchor)
                # ... and then rename it to an unkeyed 1=.
                template.get("form").name = "1"
                template.get("1").showkey = False
            after = unicode(template)
            if before != after:
                msg("Replacing %s with %s" % (before, after))
            if re.match("^[1I](-|$)", form):
                entry = ("form=%s (%s/%s)" %
                         (form, getparam(template, "2"),
                          getparam(template, "3")))
            else:
                entry = "form=%s" % form
            actions_taken.append(entry)
    changelog = "ar-verb: form= -> 1= and canonicalize to Roman numerals, move other params up: %s" % '; '.join(
        actions_taken)
    if actions_taken:
        msg("Change log = %s" % changelog)
    return text, changelog
Esempio n. 20
0
 def fix_one_page_smp(page, index, text):
     """Change pl=smp (and pl2=, pl3=, ...) to sp in ar-decl-* templates.

     Plural params are scanned in order (pl, pl2, pl3, ...) until the first
     empty one.  A value of "smp" is left alone, with a warning, when the
     head (param 1 after reorder_shadda()) ends in TAM.  Returns
     (text, changelog).
     """
     pagetitle = page.title()
     for t in text.filter_templates():
         head = reorder_shadda(getparam(t, "1"))
         if t.name.startswith("ar-decl-"):
             param = "pl"
             next_suffix = 2
             while True:
                 pl = getparam(t, param)
                 if not pl:
                     break
                 if pl == "smp":
                     if head.endswith(TAM):
                         msg("Page %s %s: WARNING: Found %s=smp with feminine ending head %s in %s: not changing"
                             % (index, pagetitle, param, head, t.name))
                     else:
                         msg("Page %s %s: Changing %s=smp to %s=sp in %s"
                             % (index, pagetitle, param, param, t.name))
                         addparam(t, param, "sp")
                 param = "pl%s" % next_suffix
                 next_suffix += 1
     changelog = "Change pl=smp to pl=sp"
     msg("Page %s %s: Change log = %s" % (index, pagetitle, changelog))
     return text, changelog
Esempio n. 21
0
 def fix_one_page_smp(page, index, text):
   """Change pl=smp (and pl2=, pl3=, ...) to sp in ar-decl-* templates.

   Plural params are scanned in order (pl, pl2, pl3, ...) until the first
   empty one.  A value of "smp" is left alone, with a warning, when the head
   (param 1 after reorder_shadda()) ends in TAM.  Returns (text, changelog).
   """
   pagetitle = page.title()
   for t in text.filter_templates():
     head = reorder_shadda(getparam(t, "1"))
     if t.name.startswith("ar-decl-"):
       param = "pl"
       next_suffix = 2
       while True:
         pl = getparam(t, param)
         if not pl:
           break
         if pl == "smp":
           if head.endswith(TAM):
             msg("Page %s %s: WARNING: Found %s=smp with feminine ending head %s in %s: not changing" % (
               index, pagetitle, param, head, t.name))
           else:
             msg("Page %s %s: Changing %s=smp to %s=sp in %s" % (
               index, pagetitle, param, param, t.name))
             addparam(t, param, "sp")
         param = "pl%s" % next_suffix
         next_suffix += 1
   changelog = "Change pl=smp to pl=sp"
   msg("Page %s %s: Change log = %s" % (index, pagetitle, changelog))
   return text, changelog
Esempio n. 22
0
def rewrite_one_page_ru_decl_noun(page, index, text):
  """Rewrite old-style Russian noun declension templates as ru-noun-table.

  Each template on the page is matched either against three specially
  handled names (ru-noun-ин, ru-noun-нок, ru-noun-vel-3) or against the
  (regex, declclass, directive[, accentclass]) entries of ru_noun_transl.
  The old parameters are pulled out with getrmparam() and the template is
  rebuilt with 1=accent class, 2=stem, 3=declension class plus optional
  4=, acc_sg=, loc=, a=, n= and notes=.

  Returns (text, comment) on success, where comment is None if nothing was
  converted.  Returns (None, "") as soon as any template cannot be converted
  cleanly, signalling that the page should not be changed.
  """
  oldtemps = []
  pagename = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  nochange = False
  for t in text.filter_templates():
    # Must be reset for every template; otherwise an unrelated template that
    # follows a converted one would be treated as convertible too.
    change = False
    def tname():
      return unicode(t.name).strip()
    origname = tname()
    origtemplate = unicode(t)
    if tname() in ["ru-noun-table", "ru-noun-old"]:
      continue
    stem = ""
    bare = ""
    accsg = ""
    locsg = ""
    if tname() == u"ru-noun-ин":
      ustem = getrmparam(t, "1")
      stem = getrmparam(t, "2")
      full = getrmparam(t, "3")
      stem = stem or ustem
      declclass = u"ин"
      if stem + u"ин" == full:
        accentclass = "1"
      elif remove_diacritics(stem) + u"и́н" == full:
        accentclass = "4"
      elif stem == full:
        accentclass = "1"
        declclass = u"-е"
      else:
        pagemsg("WARNING: Can't locate accent class for template: %s" %
            origtemplate)
        nochange = True
        break
      change = True
    elif tname() == u"ru-noun-нок":
      ustem = getrmparam(t, "1")
      stem = getrmparam(t, "2")
      uplural = getrmparam(t, "3")
      plural = getrmparam(t, "4")
      stem = stem or ustem
      plural = plural or uplural
      accentclass = "2"
      if stem.endswith(u"ё"):
        declclass = u"ёнок"
        stem = re.sub(u"ё$", "", stem)
      elif stem.endswith(u"о́"):
        declclass = u"онок"
        stem = re.sub(u"о́$", "", stem)
      else:
        pagemsg("WARNING: Template stem ends weirdly: %s" % origtemplate)
        nochange = True
        break
      if stem != re.sub(u"(я́|а́)$", "", plural):
        pagemsg("WARNING: Strange plural: %s" % origtemplate)
        nochange = True
        break
      if (declclass == u"ёнок" and not plural.endswith(u"я́") or
          declclass == u"онок" and not plural.endswith(u"а́")):
        pagemsg("WARNING: Unexpected plural ending for stem: %s" % origtemplate)
        nochange = True
        break
      change = True
    elif tname() == u"ru-noun-vel-3":
      ustem = getrmparam(t, "1")
      stem = getrmparam(t, "2")
      bare = getrmparam(t, "3")
      locsg = getrmparam(t, "13")
      locpl = getrmparam(t, "14")
      stem = stem or ustem or bare or pagename
      declclass = ""
      accentclass = "3"
      if locpl and locpl != remove_diacritics(stem) + u"а́х":
        pagemsg("WARNING: Unexpected locative plural %s: %s" % (locpl,
          origtemplate))
        nochange = True
        break
      change = True
    else:
      for entry in ru_noun_transl:
        # 3-tuples capture the accent class in the regex's single group;
        # 4-tuples carry it explicitly and must have no groups.
        if len(entry) == 3:
          regex, declclass, directive = entry
          m = re.match(regex, tname())
          if not m:
            continue
          assert len(m.groups()) == 1
          accentclass = m.group(1)
        else:
          assert len(entry) == 4
          regex, declclass, directive, accentclass = entry
          m = re.match(regex, tname())
          if not m:
            continue
          assert len(m.groups()) == 0
        # The directive says which positional params hold the stem etc.
        if directive == "stem":
          stem = getrmparam(t, "1")
        elif directive == "stem-bare":
          stem = getrmparam(t, "1")
          bare = getrmparam(t, "2")
        elif directive == "u-stem":
          ustem = getrmparam(t, "1")
          stem = getrmparam(t, "2")
          stem = stem or ustem
        elif directive == "u-stem-bare":
          ustem = getrmparam(t, "1")
          stem = getrmparam(t, "2")
          bare = getrmparam(t, "3")
          stem = stem or ustem or bare
        elif directive == "u-stem-pagename":
          ustem = getrmparam(t, "1")
          stem = getrmparam(t, "2")
          stem = stem or ustem or pagename
        elif directive == "u-stem-bare-pagename":
          ustem = getrmparam(t, "1")
          stem = getrmparam(t, "2")
          bare = getrmparam(t, "3")
          stem = stem or ustem or bare or pagename
        elif directive == "u-stem-u-bare":
          ustem = getrmparam(t, "1")
          stem = getrmparam(t, "2")
          ubare = getrmparam(t, "3")
          bare = getrmparam(t, "4")
          stem = stem or ustem
          bare = bare or ubare
        elif directive in ["u-stem-u-bare-irregpl", "u-stem-u-bare-irregpl-irregaccsg"]:
          ustem = getrmparam(t, "1")
          stem = getrmparam(t, "2")
          ubare = getrmparam(t, "3")
          bare = getrmparam(t, "4")
          irregpl = getrmparam(t, "5")
          stem = irregpl or stem or ustem
          bare = bare or ubare
          if directive == "u-stem-u-bare-irregpl-irregaccsg":
            accsg = getrmparam(t, "6")
        elif directive in ["u-stem-minus-i", "u-stem-u-bare-minus-i"]:
          ustem = getrmparam(t, "1")
          stem = getrmparam(t, "2")
          stem = stem or ustem
          unstressedi = u"и"
          stressedi = u"и́"
          # Letter plus combining acute accent: two code points to strip.
          assert len(stressedi) == 2
          if stem.endswith(unstressedi):
            stem = stem[0:-1]
          elif stem.endswith(stressedi):
            stem = stem[0:-2]
          else:
            pagemsg(u"WARNING: Stem %s doesn't end in и in %s, skipping" %
                (stem, unicode(t)))
            nochange = True
            break
        else:
          pagemsg("WARNING: Unknown directive %s, skipping" % directive)
          nochange = True
          break

        change = True
        break
      else:
        if re.match("^ru-noun-", tname()):
          pagemsg("Encountered unknown noun decl template %s" % unicode(t))
      # A failure inside the entry loop only left that inner loop; abandon
      # the page here, as the other failure paths do.
      if nochange:
        break
    if change:
      if not stem:
        pagemsg("WARNING: Can't locate stem in %s, skipping" % origtemplate)
        nochange = True
        break
      anim = getrmparam(t, "anim")
      if anim:
        anim = "an"
      n = getrmparam(t, "n")
      notes = getrmparam(t, "note")
      # Every known param has been pulled out by now; anything left over is
      # something this conversion does not understand.
      if len(t.params) > 0:
        pagemsg("WARNING: Extraneous parameters in %s, skipping" % unicode(t))
        nochange = True
        break
      addparam(t, "1", accentclass)
      addparam(t, "2", stem)
      addparam(t, "3", declclass)
      if bare:
        addparam(t, "4", bare)
      if accsg:
        addparam(t, "acc_sg", accsg)
      if locsg:
        addparam(t, "loc", locsg)
      if anim:
        addparam(t, "a", anim)
      if n:
        addparam(t, "n", n)
      if notes:
        addparam(t, "notes", notes)
      t.name = "ru-noun-table"
      pagemsg("Rewrote %s as %s" % (origtemplate, unicode(t)))
      oldtemps.append(origname)
  if nochange:
    return None, ""
  if oldtemps:
    comment = "convert %s -> ru-noun-table" % ", ".join(oldtemps)
  else:
    comment = None
  return text, comment
Esempio n. 23
0
def rewrite_one_page_ru_decl_adj(page, index, text):
  """Rewrite old-style Russian adjective declension templates as ru-decl-adj.

  ru-adj-table is simply renamed.  Other templates are first normalized to
  an ru-adjN name and, if that name is a key of ending_for_ru_adj, rebuilt
  with 1=stem, 2=ending and (for most classes) the short forms in 3..6.

  Returns (text, comment); comment is None if nothing was converted.
  """
  oldtemps = []
  pagename = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  def drop_old_positionals(templ):
    # Remove old positional params 8 down to 3.
    for argno in ("8", "7", "6", "5", "4", "3"):
      rmparam(templ, argno)
  def read_short(templ, argno, pattern, must_match, warning):
    # Fetch a short form and warn if its ending is not the expected kind.
    form = clean(getparam(templ, argno))
    if form:
      found = re.search(pattern, remove_diacritics(form))
      if bool(found) != must_match:
        pagemsg(warning % form)
    return form
  for t in text.filter_templates():
    def tname():
      return unicode(t.name).strip()
    origname = tname()
    origtemplate = unicode(t)
    if tname() == "ru-adj-table":
      t.name = "ru-decl-adj"
      oldtemps.append(origname)
      continue
    # Normalize alternative template names to the ru-adjN scheme.
    if re.match("^ru-adjective[0-9]", tname()):
      t.name = tname().replace("ru-adjective", "ru-adj")
    if tname() == "ru-passive participle decl":
      t.name = "ru-adj1"
    suffix = None
    if tname() == "ru-adj3-sja":
      suffix = u"ся"
      t.name = "ru-adj3"
    elif tname() == "ru-adj5-suffix":
      suffix = "-" + getparam(t, "8")
      t.name = "ru-adj5"
    if tname() not in ending_for_ru_adj:
      continue
    if tname() == "ru-adj13":
      addparam(t, "2", ending_for_ru_adj[tname()])
      drop_old_positionals(t)
    else:
      addparam(t, "1", getparam(t, "2").strip())
      addparam(t, "2", ending_for_ru_adj[tname()])
      if tname() in ["ru-adj7", "ru-adj8", "ru-adj9", "ru-adj12"]:
        drop_old_positionals(t)
      else:
        # The masculine warns when the ending DOES match the vowel set; the
        # other three warn when it does not match theirs.
        mshort = read_short(t, "3", u"[аяоеыи]$", False,
            "WARNING: short masculine %s doesn't have right ending")
        fshort = read_short(t, "4", u"[ая]$", True,
            "WARNING: short feminine %s doesn't have right ending")
        nshort = read_short(t, "5", u"[ое]$", True,
            "WARNING: short neuter %s doesn't have right ending")
        pshort = read_short(t, "6", u"[ыи]$", True,
            "WARNING: short plural %s doesn't have right ending")
        drop_old_positionals(t)
        # Note that fshort and nshort get reversed
        for argno, form in (("3", mshort), ("4", nshort), ("5", fshort),
            ("6", pshort)):
          if form:
            addparam(t, argno, form)
    if suffix:
      addparam(t, "suffix", suffix)
    t.name = "ru-decl-adj"
    pagemsg("Rewrote %s as %s" % (origtemplate, unicode(t)))
    oldtemps.append(origname)
  if oldtemps:
    comment = "convert %s -> ru-decl-adj" % ", ".join(oldtemps)
  else:
    comment = None
  return text, comment
Esempio n. 24
0
def rewrite_one_page_ru_decl_noun(page, index, text):
    """Rewrite legacy Russian noun declension templates as ``ru-noun-table``.

    Every template in ``text`` named ``ru-noun-ин``, ``ru-noun-нок`` or
    ``ru-noun-vel-3``, or whose name matches an entry of ``ru_noun_transl``,
    has its old parameters consumed and replaced by the accent class (1=),
    stem (2=), declension class (3=) and, when non-empty, the bare form (4=),
    ``acc_sg``, ``loc``, ``a``, ``n`` and ``notes``.  Templates already named
    ``ru-noun-table`` or ``ru-noun-old`` are skipped.

    ``ru_noun_transl`` entries are either ``(regex, declclass, directive)``,
    where the regex's single capture group supplies the accent class, or
    ``(regex, declclass, directive, accentclass)`` with no capture group.

    Args:
        page: Page object; only ``page.title()`` is used, for log messages.
        index: Page index, used only in log messages.
        text: Parsed page text exposing ``filter_templates()``; its templates
            are modified in place.

    Returns:
        ``(text, comment)`` where ``comment`` names the converted templates,
        or is ``None`` if nothing was converted.  ``(None, "")`` is returned
        as soon as any template cannot be converted safely.
    """
    oldtemps = []
    pagename = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagename, txt))

    nochange = False
    for t in text.filter_templates():
        # Must be reset for every template: a stale True left over from an
        # earlier conversion would push unrelated templates into the
        # rewriting code below, where the empty stem aborts the whole page.
        change = False

        def tname():
            return unicode(t.name).strip()

        origname = tname()
        origtemplate = unicode(t)
        if tname() in ["ru-noun-table", "ru-noun-old"]:
            continue
        stem = ""
        bare = ""
        accsg = ""
        locsg = ""
        if tname() == u"ru-noun-ин":
            ustem = getrmparam(t, "1")
            stem = getrmparam(t, "2")
            full = getrmparam(t, "3")
            stem = stem or ustem
            declclass = u"ин"
            # The accent class is inferred from how the full form relates to
            # the stem.
            if stem + u"ин" == full:
                accentclass = "1"
            elif remove_diacritics(stem) + u"и́н" == full:
                accentclass = "4"
            elif stem == full:
                accentclass = "1"
                declclass = u"-е"
            else:
                pagemsg("WARNING: Can't locate accent class for template: %s" %
                        origtemplate)
                nochange = True
                break
            change = True
        elif tname() == u"ru-noun-нок":
            ustem = getrmparam(t, "1")
            stem = getrmparam(t, "2")
            uplural = getrmparam(t, "3")
            plural = getrmparam(t, "4")
            stem = stem or ustem
            plural = plural or uplural
            accentclass = "2"
            if stem.endswith(u"ё"):
                declclass = u"ёнок"
                stem = re.sub(u"ё$", "", stem)
            elif stem.endswith(u"о́"):
                declclass = u"онок"
                stem = re.sub(u"о́$", "", stem)
            else:
                pagemsg("WARNING: Template stem ends weirdly: %s" %
                        origtemplate)
                nochange = True
                break
            # The plural must be the trimmed stem plus a stressed я/а ending
            # that agrees with the declension class.
            if stem != re.sub(u"(я́|а́)$", "", plural):
                pagemsg("WARNING: Strange plural: %s" % origtemplate)
                nochange = True
                break
            if (declclass == u"ёнок" and not plural.endswith(u"я́")
                    or declclass == u"онок" and not plural.endswith(u"а́")):
                pagemsg("WARNING: Unexpected plural ending for stem: %s" %
                        origtemplate)
                nochange = True
                break
            change = True
        elif tname() == u"ru-noun-vel-3":
            ustem = getrmparam(t, "1")
            stem = getrmparam(t, "2")
            bare = getrmparam(t, "3")
            locsg = getrmparam(t, "13")
            locpl = getrmparam(t, "14")
            stem = stem or ustem or bare or pagename
            declclass = ""
            accentclass = "3"
            if locpl and locpl != remove_diacritics(stem) + u"а́х":
                pagemsg("WARNING: Unexpected locative plural %s: %s" %
                        (locpl, origtemplate))
                nochange = True
                break
            change = True
        else:
            for entry in ru_noun_transl:
                if len(entry) == 3:
                    regex, declclass, directive = entry
                    m = re.match(regex, tname())
                    if not m:
                        continue
                    assert len(m.groups()) == 1
                    accentclass = m.group(1)
                else:
                    assert len(entry) == 4
                    regex, declclass, directive, accentclass = entry
                    m = re.match(regex, tname())
                    if not m:
                        continue
                    assert len(m.groups()) == 0
                # The directive says which positional parameters hold the
                # stem / bare form and which fallbacks apply.
                if directive == "stem":
                    stem = getrmparam(t, "1")
                elif directive == "stem-bare":
                    stem = getrmparam(t, "1")
                    bare = getrmparam(t, "2")
                elif directive == "u-stem":
                    ustem = getrmparam(t, "1")
                    stem = getrmparam(t, "2")
                    stem = stem or ustem
                elif directive == "u-stem-bare":
                    ustem = getrmparam(t, "1")
                    stem = getrmparam(t, "2")
                    bare = getrmparam(t, "3")
                    stem = stem or ustem or bare
                elif directive == "u-stem-pagename":
                    ustem = getrmparam(t, "1")
                    stem = getrmparam(t, "2")
                    stem = stem or ustem or pagename
                elif directive == "u-stem-bare-pagename":
                    ustem = getrmparam(t, "1")
                    stem = getrmparam(t, "2")
                    bare = getrmparam(t, "3")
                    stem = stem or ustem or bare or pagename
                elif directive == "u-stem-u-bare":
                    ustem = getrmparam(t, "1")
                    stem = getrmparam(t, "2")
                    ubare = getrmparam(t, "3")
                    bare = getrmparam(t, "4")
                    stem = stem or ustem
                    bare = bare or ubare
                elif directive in [
                        "u-stem-u-bare-irregpl",
                        "u-stem-u-bare-irregpl-irregaccsg"
                ]:
                    ustem = getrmparam(t, "1")
                    stem = getrmparam(t, "2")
                    ubare = getrmparam(t, "3")
                    bare = getrmparam(t, "4")
                    irregpl = getrmparam(t, "5")
                    stem = irregpl or stem or ustem
                    bare = bare or ubare
                    if directive == "u-stem-u-bare-irregpl-irregaccsg":
                        accsg = getrmparam(t, "6")
                elif directive in ["u-stem-minus-i", "u-stem-u-bare-minus-i"]:
                    ustem = getrmparam(t, "1")
                    stem = getrmparam(t, "2")
                    stem = stem or ustem
                    unstressedi = u"и"
                    stressedi = u"и́"
                    # The stressed form is a letter plus a combining accent,
                    # so two code units have to be trimmed.
                    assert len(stressedi) == 2
                    if stem.endswith(unstressedi):
                        stem = stem[0:-1]
                    elif stem.endswith(stressedi):
                        stem = stem[0:-2]
                    else:
                        pagemsg(
                            u"WARNING: Stem %s doesn't end in и in %s, skipping"
                            % (stem, unicode(t)))
                        nochange = True
                        break
                else:
                    pagemsg("WARNING: Unknown directive %s, skipping" %
                            directive)
                    nochange = True
                    break

                change = True
                break
            else:
                if re.match("^ru-noun-", tname()):
                    pagemsg("Encountered unknown noun decl template %s" %
                            unicode(t))
            # A failure inside the directive loop only left the inner loop;
            # abort the page here, as every other failure path does.
            if nochange:
                break
        if change:
            if not stem:
                pagemsg("WARNING: Can't locate stem in %s, skipping" %
                        origtemplate)
                nochange = True
                break
            anim = getrmparam(t, "anim")
            if anim:
                anim = "an"
            n = getrmparam(t, "n")
            notes = getrmparam(t, "note")
            # Everything recognised has been removed by now; anything left
            # over is a parameter this conversion does not understand.
            if len(t.params) > 0:
                pagemsg("WARNING: Extraneous parameters in %s, skipping" %
                        unicode(t))
                nochange = True
                break
            addparam(t, "1", accentclass)
            addparam(t, "2", stem)
            addparam(t, "3", declclass)
            if bare:
                addparam(t, "4", bare)
            if accsg:
                addparam(t, "acc_sg", accsg)
            if locsg:
                addparam(t, "loc", locsg)
            if anim:
                addparam(t, "a", anim)
            if n:
                addparam(t, "n", n)
            if notes:
                addparam(t, "notes", notes)
            t.name = "ru-noun-table"
            pagemsg("Rewrote %s as %s" % (origtemplate, unicode(t)))
            oldtemps.append(origname)
    if nochange:
        return None, ""
    if oldtemps:
        comment = "convert %s -> ru-noun-table" % ", ".join(oldtemps)
    else:
        comment = None
    return text, comment
Esempio n. 25
0
def rewrite_one_page_ru_decl_adj(page, index, text):
    """Rewrite legacy Russian adjective declension templates as ru-decl-adj.

    ``ru-adj-table`` is simply renamed.  Other templates are first normalised
    (``ru-adjectiveN`` -> ``ru-adjN``, ``ru-passive participle decl`` ->
    ``ru-adj1``, ``ru-adj3-sja`` / ``ru-adj5-suffix`` -> base name plus a
    ``suffix``) and, if the resulting name is a key of ``ending_for_ru_adj``,
    re-parameterised: 1= receives the old 2= (except for ``ru-adj13``), 2=
    receives the ending, and the short forms are either dropped or re-added
    with the old 4= and 5= swapped.

    Args:
        page: Page object; only ``page.title()`` is used, for log messages.
        index: Page index, used only in log messages.
        text: Parsed page text exposing ``filter_templates()``; its templates
            are modified in place.

    Returns:
        ``(text, comment)``; ``comment`` lists the original names of the
        converted templates, or is ``None`` when none were converted.
    """
    pagename = unicode(page.title())
    renamed_from = []
    # Positional parameters cleared before the new layout is written.
    old_slots = ("8", "7", "6", "5", "4", "3")
    # Classes whose short forms are discarded rather than carried over.
    no_short_forms = ["ru-adj7", "ru-adj8", "ru-adj9", "ru-adj12"]

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagename, txt))

    def name_of(tmpl):
        return unicode(tmpl.name).strip()

    def clear_old_slots(tmpl):
        for slot in old_slots:
            rmparam(tmpl, slot)

    for t in text.filter_templates():
        origname = name_of(t)
        origtemplate = unicode(t)

        if origname == "ru-adj-table":
            t.name = "ru-decl-adj"
            renamed_from.append(origname)
            continue

        # Normalise alias names; each step sees the name left by the
        # previous one.
        if re.match("^ru-adjective[0-9]", name_of(t)):
            t.name = name_of(t).replace("ru-adjective", "ru-adj")
        if name_of(t) == "ru-passive participle decl":
            t.name = "ru-adj1"
        suffix = None
        if name_of(t) == "ru-adj3-sja":
            suffix = u"ся"
            t.name = "ru-adj3"
        elif name_of(t) == "ru-adj5-suffix":
            suffix = "-" + getparam(t, "8")
            t.name = "ru-adj5"

        adjclass = name_of(t)
        if adjclass not in ending_for_ru_adj:
            continue
        ending = ending_for_ru_adj[adjclass]

        if adjclass == "ru-adj13":
            addparam(t, "2", ending)
            clear_old_slots(t)
        else:
            addparam(t, "1", getparam(t, "2").strip())
            addparam(t, "2", ending)
            if adjclass in no_short_forms:
                clear_old_slots(t)
            else:
                # Sanity-check each short form's final letter before moving
                # it; a mismatch is only logged.
                mshort = clean(getparam(t, "3"))
                if mshort and re.search(u"[аяоеыи]$",
                                        remove_diacritics(mshort)):
                    pagemsg(
                        "WARNING: short masculine %s doesn't have right ending"
                        % mshort)
                fshort = clean(getparam(t, "4"))
                if fshort and not re.search(u"[ая]$",
                                            remove_diacritics(fshort)):
                    pagemsg(
                        "WARNING: short feminine %s doesn't have right ending"
                        % fshort)
                nshort = clean(getparam(t, "5"))
                if nshort and not re.search(u"[ое]$",
                                            remove_diacritics(nshort)):
                    pagemsg(
                        "WARNING: short neuter %s doesn't have right ending"
                        % nshort)
                pshort = clean(getparam(t, "6"))
                if pshort and not re.search(u"[ыи]$",
                                            remove_diacritics(pshort)):
                    pagemsg(
                        "WARNING: short plural %s doesn't have right ending"
                        % pshort)
                clear_old_slots(t)
                # Neuter goes to 4= and feminine to 5=, the reverse of the
                # old layout.
                for slot, value in (("3", mshort), ("4", nshort),
                                    ("5", fshort), ("6", pshort)):
                    if value:
                        addparam(t, slot, value)

        if suffix:
            addparam(t, "suffix", suffix)
        t.name = "ru-decl-adj"
        pagemsg("Rewrote %s as %s" % (origtemplate, unicode(t)))
        renamed_from.append(origname)

    if not renamed_from:
        return text, None
    return text, "convert %s -> ru-decl-adj" % ", ".join(renamed_from)
Esempio n. 26
0
def rewrite_one_page_arz_headword(page, index, text):
  """Convert arz-noun and arz-adj headword templates to the new parameters.

  For ``arz-noun``: head= moves to 1=, g= moves to 2=, and the old 2=/3=
  become pl=/pltr=.  For ``arz-adj``: head= moves to 1=, pwv= (or p=)
  becomes pl=, ptr= becomes pltr=, and fwv= (or f=) becomes f=.  Optional
  values are written back only when non-empty.

  Args:
    page: Unused.
    index: Unused.
    text: Parsed page text exposing ``filter_templates()``; its templates are
      modified in place.

  Returns:
    ``(text, comment)``; the comment lists one template name per rewritten
    template.
  """
  def take(tmpl, name):
    # Read a parameter's value and remove it from the template.
    value = getparam(tmpl, name)
    rmparam(tmpl, name)
    return value

  def add_present(tmpl, pairs):
    # Write back, in order, only those parameters that have a value.
    for name, value in pairs:
      if value:
        addparam(tmpl, name, value)

  rewritten = []
  for t in text.filter_templates():
    tempname = unicode(t.name)
    if tempname == "arz-noun":
      head = take(t, "head")
      tr = take(t, "tr")
      sort = take(t, "sort")
      g = take(t, "g")
      g2 = take(t, "g2")
      pl = take(t, "2")
      pltr = take(t, "3")
      # 1= and 2= are always written, even when empty.
      addparam(t, "1", head)
      addparam(t, "2", g)
      add_present(t, [("g2", g2), ("tr", tr), ("pl", pl), ("pltr", pltr),
                      ("sort", sort)])
      rewritten.append("arz-noun")
    elif tempname == "arz-adj":
      head = take(t, "head")
      tr = take(t, "tr")
      sort = take(t, "sort")
      # The "wv" variant wins over the plain one; both are removed.
      pl_wv = take(t, "pwv")
      pl_plain = take(t, "p")
      pl = pl_wv or pl_plain
      pltr = take(t, "ptr")
      f_wv = take(t, "fwv")
      f_plain = take(t, "f")
      f = f_wv or f_plain
      ftr = take(t, "ftr")
      addparam(t, "1", head)
      add_present(t, [("tr", tr), ("f", f), ("ftr", ftr), ("pl", pl),
                      ("pltr", pltr), ("sort", sort)])
      rewritten.append("arz-adj")
  return text, "rewrite %s to new style" % ", ".join(rewritten)
Esempio n. 27
0
def vocalize_head(pagetitle, index, template):
    """Vocalize the head parameters of a headword template from their translit.

    If tr= holds several comma-separated transliterations, it is first split:
    tr= keeps the first one and each remaining one gets a new headN=/trN=
    pair.  Then 1= is vocalized against tr= (falling back to the page title
    when that yields a false result), followed by head2=/tr2=, head3=/tr3=,
    ... until ``vocalize_param`` returns a false value.

    Args:
        pagetitle: Title of the page, used as the fallback head and in logs.
        index: Page index, used only in log messages.
        template: Template to modify in place.

    Returns:
        List of strings describing which parameters were changed.
    """
    changes = []

    if template.has("tr"):
        translit = getparam(template, "tr")
        if "," in translit:
            pieces = re.split(",\\s*", translit)
            # First headN slot (N >= 2) that is still free.
            slot = 2
            while template.has("head" + str(slot)):
                slot += 1
            addparam(template, "tr", pieces[0])
            # Reuse an existing 1= for the new heads only when it ends in
            # U+064C (tanwīn -un): other 1= values tend to be vocalized for
            # the first transliteration and won't fit the rest.
            newhead = pagetitle
            if template.has("1"):
                existing = getparam(template, "1")
                if existing.endswith(u"\u064C"):
                    newhead = existing
            for extra in pieces[1:]:
                addparam(template, "head" + str(slot), newhead)
                addparam(template, "tr" + str(slot), extra)
                slot += 1
            changes.append("split translit into multiple heads")

        # A string result is taken to mean 1= was rewritten.
        outcome = vocalize_param(pagetitle, index, template, "1", "tr")
        if isinstance(outcome, basestring):
            changes.append("1")

        # No usable 1=: vocalize the page title and install it as 1=.
        if not outcome:
            arabic = unicode(pagetitle)
            latin = getparam(template, "tr")
            if arabic and latin:
                vocalized = do_vocalize_param(pagetitle, index, template,
                                              "page title", arabic, latin)
                if vocalized:
                    oldtempl = "%s" % unicode(template)
                    anchor = "2" if template.has("2") else "tr"
                    addparam(template, "1", vocalized, before=anchor)
                    changes.append("1")
                    msg("Page %s %s: Replaced %s with %s" %
                        (index, pagetitle, oldtempl, unicode(template)))

    # Walk head2, head3, ... until one yields a false result.
    n = 2
    while True:
        headparam = "head" + str(n)
        outcome = vocalize_param(pagetitle, index, template, headparam,
                                 "tr" + str(n))
        if isinstance(outcome, basestring):
            changes.append(headparam)
        if not outcome:
            break
        n += 1
    return changes
Esempio n. 28
0
def vocalize_head(pagetitle, index, template):
  """Vocalize 1= and headN= of a template using tr= and trN=.

  A comma-separated tr= is split first: the first transliteration stays in
  tr=, the others are placed in fresh headN=/trN= pairs.  After that 1= is
  vocalized (or, failing that, created from the vocalized page title), and
  the numbered heads are processed in order until ``vocalize_param`` returns
  a false value.

  Args:
    pagetitle: Page title; fallback head value and log prefix.
    index: Page index, used only in log messages.
    template: Template to modify in place.

  Returns:
    List of strings naming the parameters that were changed.
  """
  def numbered(prefix, n):
    return prefix + str(n)

  def head_for_new_slots():
    # An existing 1= is trusted only if it ends in U+064C (tanwīn -un);
    # otherwise it is probably vocalized for the first transliteration only
    # and won't work with the others.
    if not template.has("1"):
      return pagetitle
    current = getparam(template, "1")
    if current.endswith(u"\u064C"):
      return current
    return pagetitle

  paramschanged = []

  if template.has("tr"):
    all_trs = getparam(template, "tr")
    if "," in all_trs:
      first_tr = None
      later_trs = []
      for pos, piece in enumerate(re.split(",\\s*", all_trs)):
        if pos == 0:
          first_tr = piece
        else:
          later_trs.append(piece)
      # Skip past head2, head3, ... that already exist.
      free = 2
      while template.has(numbered("head", free)):
        free += 1
      addparam(template, "tr", first_tr)
      head = head_for_new_slots()
      for offset, piece in enumerate(later_trs):
        addparam(template, numbered("head", free + offset), head)
        addparam(template, numbered("tr", free + offset), piece)
      paramschanged.append("split translit into multiple heads")

    # Vocalize 1=; a string result is recorded as a change.
    result = vocalize_param(pagetitle, index, template, "1", "tr")
    if isinstance(result, basestring):
      paramschanged.append("1")

    # With a false result, fall back to vocalizing the page title.
    if not result:
      arabic = unicode(pagetitle)
      latin = getparam(template, "tr")
      vocalized = None
      if arabic and latin:
        vocalized = do_vocalize_param(pagetitle, index, template, "page title",
            arabic, latin)
      if vocalized:
        oldtempl = "%s" % unicode(template)
        if template.has("2"):
          addparam(template, "1", vocalized, before="2")
        else:
          addparam(template, "1", vocalized, before="tr")
        paramschanged.append("1")
        msg("Page %s %s: Replaced %s with %s" % (index, pagetitle,
          oldtempl, unicode(template)))

  # Numbered heads: stop at the first false result.
  num = 2
  result = True
  while result:
    headname = numbered("head", num)
    result = vocalize_param(pagetitle, index, template, headname,
        numbered("tr", num))
    if isinstance(result, basestring):
      paramschanged.append(headname)
    num += 1
  return paramschanged
Esempio n. 29
0
def process_param(pagetitle, index, template, param, paramtr,
    include_tempname_in_changelog=False):
  """Canonicalize or remove the transliteration that accompanies ``param``.

  The manual transliteration in ``paramtr`` is match-canonicalized against
  the Arabic in ``param`` and compared with the automatic transliteration.
  If both are available and equal, ``paramtr`` is removed as redundant.
  Otherwise the parameter is rewritten to its match-canonical form or, when
  matching produced nothing, to its self-canonical form.

  Args:
    pagetitle: Page title, used in log messages.
    index: Page index, used in log messages.
    template: Template to modify in place.
    param: Name of the parameter holding the Arabic text.
    paramtr: Name of the parameter holding the transliteration.
    include_tempname_in_changelog: If true, prefix the parameter name in the
      returned changelog entry with the template name.

  Returns:
    ``False`` if ``param`` is empty; a one-element list with a changelog
    entry if the template was changed; ``True`` otherwise.
  """
  def pagemsg(text):
    msg("Page %s %s: %s.%s: %s" % (index, pagetitle, template.name, param,
      text))

  def change_and_log(mutate):
    # Snapshot the template, apply the mutation, then log old vs. new.
    oldtempl = "%s" % unicode(template)
    mutate()
    msg("Page %s %s: Replaced %s with %s" %
        (index, pagetitle, oldtempl, unicode(template)))

  arabic = getparam(template, param)
  latin = getparam(template, paramtr)
  paramtrname = ("%s.%s" % (template.name, paramtr)
      if include_tempname_in_changelog else paramtr)

  if not arabic:
    return False
  if latin == "-":
    pagemsg("Translit is '-', skipping")
    return True
  if not latin:
    return True

  # Both lookups are best-effort: a failure is logged and treated as
  # "no result".
  try:
    _, canonlatin = tr_matching(arabic, latin, True, pagemsg)
    if not canonlatin:
      pagemsg("Unable to match-canonicalize %s (%s)" % (arabic, latin))
  except Exception as e:
    pagemsg("Trying to match-canonicalize %s (%s): %s" % (arabic, latin, e))
    canonlatin = None
  try:
    translit = ar_translit.tr(arabic)
    if not translit:
      pagemsg("Unable to auto-translit %s" % arabic)
  except Exception as e:
    pagemsg("Trying to transliterate %s: %s" % (arabic, e))
    translit = None

  if translit and canonlatin:
    # Only an exact match counts as redundant.
    if translit != canonlatin:
      pagemsg("Auto-translit for %s (%s) not same as manual translit %s (canonicalized %s)" %
          (arabic, translit, latin, canonlatin))
    else:
      pagemsg("Removing redundant translit for %s (%s)" % (arabic, latin))
      change_and_log(lambda: template.remove(paramtr))
      return ["remove redundant %s=%s" % (paramtrname, latin)]

  if canonlatin:
    if latin == canonlatin:
      return True
    pagemsg("Match-canonicalizing Latin %s to %s" % (latin, canonlatin))
    change_and_log(lambda: addparam(template, paramtr, canonlatin))
    return ["match-canon %s=%s -> %s" % (paramtrname, latin, canonlatin)]

  # Matching gave nothing usable: canonicalize the Latin on its own.
  canonlatin, _ = ar_translit.canonicalize_latin_arabic(latin, None)
  if latin == canonlatin:
    return True
  pagemsg("Self-canonicalizing Latin %s to %s" % (latin, canonlatin))
  change_and_log(lambda: addparam(template, paramtr, canonlatin))
  return ["self-canon %s=%s -> %s" % (paramtrname, latin, canonlatin)]
 def putp(param, value):
     addparam(headword_template, param, value)
Esempio n. 31
0
def canon_head(pagetitle, index, template):
  """Canonicalize the head parameters of a template against their translit.

  A tr= containing several transliterations separated by "," or "/" is split
  first: tr= keeps the first one, each other one gets a new headN=/trN=
  pair.  Then 1=/tr= is canonicalized; if ``canon_param`` returns ``False``
  for it, the page title is canonicalized instead and installed as 1=.
  Finally head2=/tr2=, head3=/tr3=, ... are canonicalized until
  ``canon_param`` returns ``False``.

  Args:
    pagetitle: Page title; fallback head value and log prefix.
    index: Page index, used only in log messages.
    template: Template to modify in place.

  Returns:
    List of action strings collected from the individual steps.
  """
  actions = []

  if template.has("tr"):
    translits = getparam(template, "tr")
    if "," in translits or "/" in translits:
      pieces = re.split("\\s*[,/]\\s*", translits)
      # First headN slot (N >= 2) not yet in use.
      slot = 2
      while template.has("head" + str(slot)):
        slot += 1
      addparam(template, "tr", pieces[0])
      # Keep an existing 1= for the new heads only if it ends in U+064C
      # (tanwīn -un); other values are likely vocalized for the first
      # transliteration and won't work with the others.
      head = pagetitle
      if template.has("1"):
        existing = getparam(template, "1")
        if existing.endswith(u"\u064C"):
          head = existing
      for piece in pieces[1:]:
        addparam(template, "head" + str(slot), head)
        addparam(template, "tr" + str(slot), piece)
        slot += 1
      actions.append("split translit into multiple heads")

    result = canon_param(pagetitle, index, template, "1", "tr")
    if result == False:
      # No 1= to work with: canonicalize the page title and make it 1=.
      arabic = pagetitle
      latin = getparam(template, "tr")
      if arabic and latin:
        canonarabic, canonlatin, newactions = do_canon_param(
            pagetitle, index, template, "page title", "1", "tr", arabic, latin)
        oldtempl = "%s" % unicode(template)
        if canonarabic:
          anchor = "2" if template.has("2") else "tr"
          addparam(template, "1", canonarabic, before=anchor)
        # canonlatin of True means tr= is to be removed; any other true
        # value replaces it.
        if canonlatin == True:
          template.remove("tr")
        elif canonlatin:
          addparam(template, "tr", canonlatin)
        actions.extend(newactions)
        if canonarabic or canonlatin:
          msg("Page %s %s: Replaced %s with %s" % (index, pagetitle,
            oldtempl, unicode(template)))
    else:
      actions.extend(result)

  # Numbered heads: stop at the first one canon_param rejects with False.
  num = 2
  while True:
    headname = "head" + str(num)
    result = canon_param(pagetitle, index, template, headname, "tr" + str(num))
    if result == False:
      break
    actions.extend(result)
    num += 1
  return actions
Esempio n. 32
0
def rewrite_one_page_idafa(page, index, text):
    """Normalise state parameters of ar-decl-* templates on one page.

    Two rewrites are active: ``state=ind`` becomes ``state=ind-def`` when the
    page also contains an ``ar-proper noun`` template, and
    ``state=def|basestate=ind`` becomes ``state=ind-def``.  Leftover idafa
    parameters, ``omitarticle`` and a remaining ``state=ind`` are reported as
    warnings only.

    Args:
        page: Page object; only ``page.title()`` is used, for log messages.
        index: Page index, used only in log messages.
        text: Parsed page text exposing ``filter_templates()``; its templates
            are modified in place.

    Returns:
        ``(text, changelog)``; ``changelog`` is ``""`` when nothing changed.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    # The passes that used to fill these three (old-style -> new-style idafa,
    # manual idafa params -> idafa=, modN -> modheadN) are currently
    # disabled, so they keep their initial values.
    idafa_added = []
    num_new_style = 0
    num_modhead_changed = 0
    num_state_ind_to_ind_def = 0
    num_basestate_ind_def = 0

    leftover_idafa_params = ("basestate", "modcase", "modstate", "modnumber",
                             "modidafa")

    has_proper_noun = any(
        t.name == "ar-proper noun" for t in text.filter_templates())

    for t in text.filter_templates():
        if not t.name.startswith("ar-decl-"):
            continue

        before = unicode(t)
        state = getparam(t, "state")
        if state == "ind" and has_proper_noun:
            addparam(t, "state", "ind-def")
            pagemsg(
                "Converting state=ind to state=ind-def for proper noun")
            pagemsg("Replacing %s with %s" % (before, unicode(t)))
            num_state_ind_to_ind_def += 1
        elif state == "def" and getparam(t, "basestate") == "ind":
            t.remove("basestate")
            addparam(t, "state", "ind-def")
            pagemsg("Converting state=def|basestate=ind to state=ind-def")
            pagemsg("Replacing %s with %s" % (before, unicode(t)))
            num_basestate_ind_def += 1

        # Post-rewrite checks; the parameters are re-read because the
        # template may just have been modified.
        if any(getparam(t, name) for name in leftover_idafa_params):
            pagemsg("WARNING: idafa params remain after processing: %s" %
                    unicode(t))
        if getparam(t, "omitarticle"):
            pagemsg("WARNING: omitarticle present: %s" % unicode(t))
        if getparam(t, "state") == "ind":
            pagemsg("WARNING: state=ind still present: %s" % unicode(t))

    candidates = [
        (idafa_added,
         u"Replaced ʾidāfa params with idafa= param: %s" %
         (", ".join(idafa_added))),
        (num_new_style,
         u"Corrected %s old-style ʾidāfa param(s) to new-style" %
         num_new_style),
        (num_modhead_changed, u"Changed modN to modheadN"),
        (num_state_ind_to_ind_def,
         u"Converted state=ind to state=ind-def for proper noun"),
        (num_basestate_ind_def,
         u"Converted state=def|basestate=ind to state=ind-def"),
    ]
    actions = [message for happened, message in candidates if happened]
    if not actions:
        return text, ""
    changelog = "; ".join(actions)
    pagemsg("Changelog = %s" % changelog)
    return text, changelog
Esempio n. 33
0
def canon_head(pagetitle, index, template):
    """Canonicalize the head of `template` and its numbered alternate heads.

    Steps, in order:
      1. If tr= holds several transliterations separated by "," or "/",
         keep the first in tr= and move the rest into new headN=/trN=
         pairs, starting at the first N >= 2 with no existing headN=.
      2. Run canon_param() on 1=/tr=.  If it returns False, fall back to
         canonicalizing the page title against tr= and store the result
         in 1=.
      3. Run canon_param() on head2=/tr2=, head3=/tr3=, ... until it
         returns False.
    Steps 1 and 2 only happen when the template has a tr= parameter.

    Returns the list of accumulated action strings.  canon_param() is
    assumed to return either False or an iterable of action strings --
    TODO confirm against its definition.
    """
    actions = []

    if template.has("tr"):
        translit = getparam(template, "tr")
        if "," in translit or "/" in translit:
            pieces = re.split("\\s*[,/]\\s*", translit)
            # Locate the first headN (N >= 2) that is still unused.
            first_free = 2
            while template.has("head" + str(first_free)):
                first_free += 1
            addparam(template, "tr", pieces[0])
            # The new heads reuse the existing 1= only if it ends in
            # U+064C (the -un tanwin ending); per the original author,
            # other 1= values tend to be vocalized for the first
            # transliteration only, so the bare page title is used instead.
            extra_head = pagetitle
            if template.has("1"):
                current = getparam(template, "1")
                if current.endswith(u"\u064C"):
                    extra_head = current
            for slot, piece in enumerate(pieces[1:], first_free):
                addparam(template, "head" + str(slot), extra_head)
                addparam(template, "tr" + str(slot), piece)
            actions.append("split translit into multiple heads")

        # Try to vocalize 1= against tr=.
        outcome = canon_param(pagetitle, index, template, "1", "tr")
        if outcome != False:
            actions.extend(outcome)
        else:
            # No usable 1=: vocalize the page title and store it as 1=.
            translit = getparam(template, "tr")
            if pagetitle and translit:
                canonarabic, canonlatin, newactions = do_canon_param(
                    pagetitle, index, template, "page title", "1", "tr",
                    pagetitle, translit)
                before = "%s" % unicode(template)
                if canonarabic:
                    # Insert 1= ahead of 2= when present, else ahead of tr=.
                    anchor = "2" if template.has("2") else "tr"
                    addparam(template, "1", canonarabic, before=anchor)
                if canonlatin == True:
                    # A literal True here leads to tr= being dropped.
                    template.remove("tr")
                elif canonlatin:
                    addparam(template, "tr", canonlatin)
                actions.extend(newactions)
                if canonarabic or canonlatin:
                    msg("Page %s %s: Replaced %s with %s" %
                        (index, pagetitle, before, unicode(template)))

    # Walk head2/tr2, head3/tr3, ... until canon_param() reports False.
    slot = 2
    while True:
        outcome = canon_param(pagetitle, index, template,
                              "head" + str(slot), "tr" + str(slot))
        if outcome == False:
            break
        actions.extend(outcome)
        slot += 1
    return actions
Esempio n. 34
0
def rewrite_one_page_arz_headword(page, index, text):
    """Rewrite arz-noun / arz-adj templates in `text` to the new param layout.

    arz-noun: head= -> 1=, g= -> 2=, old positional 2= -> pl=, old
    positional 3= -> pltr=; g2=, tr= and sort= are removed and re-added
    (when non-empty) so they end up after 1= and 2=.
    arz-adj: head= -> 1=, pwv= (or p=) -> pl=, ptr= -> pltr=, fwv= (or f=)
    -> f=; tr=, ftr= and sort= are removed and re-added when non-empty.

    `page` and `index` are accepted but not used.  Returns a pair
    (text, changelog), where `text` is the object passed in and changelog
    names every template that was rewritten, in order of appearance.
    """
    def take(tmpl, *names):
        # Read `names` left to right until a non-empty value turns up (the
        # last value read is used if none is), then remove all of them.
        value = ""
        for name in names:
            value = getparam(tmpl, name)
            if value:
                break
        for name in names:
            rmparam(tmpl, name)
        return value

    def put_nonempty(tmpl, pairs):
        # Add each (name, value) pair in order, skipping empty values.
        for name, value in pairs:
            if value:
                addparam(tmpl, name, value)

    def convert_noun(tmpl):
        head = take(tmpl, "head")
        tr = take(tmpl, "tr")
        sort = take(tmpl, "sort")
        gender = take(tmpl, "g")
        gender2 = take(tmpl, "g2")
        # The old positional 2=/3= must be read before 2= is reused below.
        plural = take(tmpl, "2")
        plural_tr = take(tmpl, "3")
        addparam(tmpl, "1", head)
        addparam(tmpl, "2", gender)
        put_nonempty(tmpl, [("g2", gender2), ("tr", tr), ("pl", plural),
                            ("pltr", plural_tr), ("sort", sort)])

    def convert_adj(tmpl):
        head = take(tmpl, "head")
        tr = take(tmpl, "tr")
        sort = take(tmpl, "sort")
        plural = take(tmpl, "pwv", "p")
        plural_tr = take(tmpl, "ptr")
        feminine = take(tmpl, "fwv", "f")
        feminine_tr = take(tmpl, "ftr")
        addparam(tmpl, "1", head)
        put_nonempty(tmpl, [("tr", tr), ("f", feminine), ("ftr", feminine_tr),
                            ("pl", plural), ("pltr", plural_tr),
                            ("sort", sort)])

    converters = (("arz-noun", convert_noun), ("arz-adj", convert_adj))
    temps_changed = []
    for t in text.filter_templates():
        tname = unicode(t.name)
        for label, convert in converters:
            if tname == label:
                convert(t)
                temps_changed.append(label)
                break
    return text, "rewrite %s to new style" % ", ".join(temps_changed)
Esempio n. 35
0
 def putp(param, value):
   """Set `param` to `value` on `headword_template` by calling addparam().

   NOTE(review): `headword_template` is not defined in this fragment; it
   presumably comes from an enclosing function's scope -- confirm.
   """
   addparam(headword_template, param, value)
Esempio n. 36
0
def rewrite_one_page_idafa(page, index, text):
  """Normalize state= on the ar-decl-* templates of one page.

  Two rewrites are active:
    * state=ind becomes state=ind-def when the page also has an
      "ar-proper noun" template;
    * otherwise, state=def combined with basestate=ind becomes
      state=ind-def and basestate= is removed.
  After that, a WARNING is logged for each ar-decl-* template that still
  carries basestate=/modcase=/modstate=/modnumber=/modidafa=, has
  omitarticle=, or still has state=ind.

  Returns (text, changelog): `text` is the object passed in, and
  changelog is "; "-joined action descriptions, or "" when nothing was
  rewritten.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def log_rewrite(reason, before, tmpl):
    # Log the reason for a rewrite followed by the before/after wikitext.
    pagemsg(reason)
    pagemsg("Replacing %s with %s" % (before, unicode(tmpl)))

  # Three earlier passes are currently disabled: converting old-style idafa
  # (state=con) to new-style (basestate=con), replacing manual idafa params
  # with idafa=, and renaming modN= to modheadN=.  Their bookkeeping
  # variables are kept, but nothing below updates them, so their changelog
  # entries are never emitted.
  idafa_added = []
  num_new_style = 0
  num_modhead_changed = 0

  num_state_ind_to_ind_def = 0
  num_basestate_ind_def = 0

  has_proper_noun = any(
      t.name == "ar-proper noun" for t in text.filter_templates())

  leftover_idafa_params = (
      "basestate", "modcase", "modstate", "modnumber", "modidafa")

  for t in text.filter_templates():
    if not t.name.startswith("ar-decl-"):
      continue

    before = unicode(t)
    if getparam(t, "state") == "ind" and has_proper_noun:
      addparam(t, "state", "ind-def")
      log_rewrite("Converting state=ind to state=ind-def for proper noun",
          before, t)
      num_state_ind_to_ind_def += 1
    elif getparam(t, "state") == "def" and getparam(t, "basestate") == "ind":
      t.remove("basestate")
      addparam(t, "state", "ind-def")
      log_rewrite("Converting state=def|basestate=ind to state=ind-def",
          before, t)
      num_basestate_ind_def += 1

    # Report anything the rewrites above did not clean up.
    if any(getparam(t, name) for name in leftover_idafa_params):
      pagemsg("WARNING: idafa params remain after processing: %s" %
          unicode(t))
    if getparam(t, "omitarticle"):
      pagemsg("WARNING: omitarticle present: %s" % unicode(t))
    if getparam(t, "state") == "ind":
      pagemsg("WARNING: state=ind still present: %s" % unicode(t))

  # (trigger, description) pairs in the order they appear in the changelog.
  candidates = [
    (idafa_added,
     u"Replaced ʾidāfa params with idafa= param: %s" % (
         ", ".join(idafa_added))),
    (num_new_style,
     u"Corrected %s old-style ʾidāfa param(s) to new-style" % num_new_style),
    (num_modhead_changed, u"Changed modN to modheadN"),
    (num_state_ind_to_ind_def,
     u"Converted state=ind to state=ind-def for proper noun"),
    (num_basestate_ind_def,
     u"Converted state=def|basestate=ind to state=ind-def"),
  ]
  actions = [description for trigger, description in candidates if trigger]
  if not actions:
    return text, ""
  changelog = "; ".join(actions)
  pagemsg("Changelog = %s" % changelog)
  return text, changelog
Esempio n. 37
0
def process_param(pagetitle, index, template, param, paramtr,
    include_tempname_in_changelog=False):
  """Canonicalize or drop the transliteration `paramtr` that goes with `param`.

  The Arabic text is read from `param` and the Latin transliteration from
  `paramtr`.  The result is one of:
    * False -- `param` is empty or missing;
    * True  -- nothing was changed (no translit, translit is "-", or the
      translit is already canonical);
    * a one-element list holding a changelog string -- the template was
      edited: the translit was removed because it equals the automatic
      transliteration, or it was replaced by its canonical form.

  When `include_tempname_in_changelog` is true, the changelog string names
  the parameter as "<template name>.<paramtr>" instead of just `paramtr`.
  """
  def pagemsg(text):
    msg("Page %s %s: %s.%s: %s" % (index, pagetitle, template.name, param,
      text))

  def edit_and_log(edit):
    # Snapshot the template, run the edit, then log the before/after pair.
    before = "%s" % unicode(template)
    edit()
    msg("Page %s %s: Replaced %s with %s" %
        (index, pagetitle, before, unicode(template)))

  arabic = getparam(template, param)
  latin = getparam(template, paramtr)
  if not arabic:
    return False
  if latin == "-":
    pagemsg("Translit is '-', skipping")
    return True
  if not latin:
    return True

  if include_tempname_in_changelog:
    label = "%s.%s" % (template.name, paramtr)
  else:
    label = paramtr

  # Both lookups are best-effort: a failure is logged and treated as
  # "no result" rather than aborting the page.
  try:
    _, canonlatin = tr_matching(arabic, latin, True, pagemsg)
    if not canonlatin:
      pagemsg("Unable to match-canonicalize %s (%s)" % (arabic, latin))
  except Exception as e:
    pagemsg("Trying to match-canonicalize %s (%s): %s" % (arabic, latin, e))
    canonlatin = None
  try:
    translit = ar_translit.tr(arabic)
    if not translit:
      pagemsg("Unable to auto-translit %s" % arabic)
  except Exception as e:
    pagemsg("Trying to transliterate %s: %s" % (arabic, e))
    translit = None

  if canonlatin:
    if translit:
      if translit == canonlatin:
        # The manual translit adds nothing over the automatic one.
        pagemsg("Removing redundant translit for %s (%s)" % (arabic, latin))
        edit_and_log(lambda: template.remove(paramtr))
        return ["remove redundant %s=%s" % (label, latin)]
      pagemsg("Auto-translit for %s (%s) not same as manual translit %s (canonicalized %s)" %
          (arabic, translit, latin, canonlatin))
    if latin == canonlatin:
      return True
    pagemsg("Match-canonicalizing Latin %s to %s" % (latin, canonlatin))
    edit_and_log(lambda: addparam(template, paramtr, canonlatin))
    return ["match-canon %s=%s -> %s" % (label, latin, canonlatin)]

  # Matching against the Arabic gave nothing; canonicalize the Latin alone.
  selfcanon, _ = ar_translit.canonicalize_latin_arabic(latin, None)
  if latin == selfcanon:
    return True
  pagemsg("Self-canonicalizing Latin %s to %s" % (latin, selfcanon))
  edit_and_log(lambda: addparam(template, paramtr, selfcanon))
  return ["self-canon %s=%s -> %s" % (label, latin, selfcanon)]