Esempio n. 1
0
def process_page(index, page, save, verbose):
  """Remove sort= from every template on the page named in fr_head_templates.

  index and the page title only serve to prefix log messages. Pages whose
  title contains a colon are skipped. When `save` is true the modified text is
  written back with an edit summary built from the collected notes; otherwise
  the would-be summary is only logged. `verbose` is accepted but not used.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  text = unicode(page.text)
  parsed = blib.parse_text(text)
  notes = []

  for template in parsed.filter_templates():
    tempname = unicode(template.name)
    if tempname not in fr_head_templates:
      continue
    before = unicode(template)
    rmparam(template, "sort")
    after = unicode(template)
    if before != after:
      pagemsg("Replacing %s with %s" % (before, after))
      notes.append("remove sort= from {{%s}}" % tempname)

  newtext = unicode(parsed)
  if newtext == text:
    return

  # Any textual change must have come from a template edit recorded above.
  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = newtext
  page.save(comment=comment)
Esempio n. 2
0
def process_page(index, page, save, verbose):
  """Move the 1= argument of {{ru-IPA}} templates on a page into phon=.

  Templates that already have a non-empty phon= are left alone. Templates
  with no (or only a blank) 1= are also left alone, with a warning: there is
  nothing to move, and adding an empty phon= would only produce a junk edit.

  index and the page title only serve to prefix log messages. When `save` is
  true the changed text is written back; otherwise the would-be edit summary
  is logged. With `verbose`, the full old and new page text are logged too.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  text = unicode(page.text)
  parsed = blib.parse(page)

  for t in parsed.filter_templates():
    if unicode(t.name) != "ru-IPA":
      continue
    origt = unicode(t)
    if getparam(t, "phon"):
      pagemsg("phon= already present: %s" % origt)
      continue
    phon = getparam(t, "1")
    if not phon.strip():
      # Guard against turning e.g. {{ru-IPA}} into {{ru-IPA|phon=}}.
      pagemsg("WARNING: No value in 1=, not adding phon=: %s" % origt)
      continue
    pagemsg("Adding phon=: %s" % origt)
    rmparam(t, "1")
    t.add("phon", phon)
    pagemsg("Replaced %s with %s" % (origt, unicode(t)))

  newtext = unicode(parsed)

  if newtext == text:
    pagemsg("Skipping")
    return

  if verbose:
    pagemsg("Replacing <<%s>> with <<%s>>" % (text, newtext))
  comment = "Add phon= to ru-IPA templates"
  if save:
    pagemsg("Saving with comment = %s" % comment)
    page.text = newtext
    page.save(comment=comment)
  else:
    pagemsg("Would save with comment = %s" % comment)
Esempio n. 3
0
 def fix_cite_book_params(t):
   """Rename legacy parameters of template `t` in place.

   Handles year/origyear, origdate, origmonth and id (converted to isbn), then
   passes the template to fix_page_params(). Returns True when the template's
   text differs from what it was on entry.
   """
   before = unicode(t)

   def frob_isbn(idval):
     # Drop a leading ISBN label (any case) while keeping leading whitespace;
     # a value starting with a digit is passed through unchanged. Anything
     # else is reported and rejected by returning None.
     isbn_re = r"^(\s*)(10-ISBN +|ISBN-13 +|ISBN:? +|ISBN[-=] *)"
     if re.search(isbn_re, idval, re.I):
       return re.sub(isbn_re, r"\1", idval, 0, re.I)
     if re.search(r"^[0-9]", idval.strip()):
       return idval
     pagemsg("WARNING: Would replace id= -> isbn= but id=%s doesn't begin with 'ISBN '" %
         idval.replace("\n", r"\n"))
     return None

   # Only when both origyear= and year= carry a value: year= becomes
   # year_published= and origyear= becomes year=.
   if getparam(t, "origyear").strip() and getparam(t, "year").strip():
     if not getparam(t, "year_published"):
       rmparam(t, "year_published") # in case of blank param
       t.get("year").name = "year_published"
       t.get("origyear").name = "year"
       pagemsg("year -> year_published, origyear -> year")
     else:
       pagemsg("WARNING: Would set year_published= but is already present: %s"
           % unicode(t))

   for old, new in (("origdate", "date"), ("origmonth", "month")):
     move_param(t, old, new)
   move_param(t, "id", "isbn", frob_isbn)
   fix_page_params(t)
   return before != unicode(t)
Esempio n. 4
0
 def fix_quote_usenet_params(t):
   """Rename legacy and positional parameters of template `t` in place.

   Merges monthday= and year= into date= when both have a value, then renames
   group=, text= and the positional parameters 6..1 to their named
   equivalents. Returns True when the template's text changed.
   """
   before = unicode(t)

   monthday = getparam(t, "monthday").strip()
   year = getparam(t, "year").strip()
   if monthday and year:
     if getparam(t, "date"):
       pagemsg("WARNING: Would set date= but is already present: %s"
           % unicode(t))
     else:
       rmparam(t, "date") # in case of blank param
       # Reuse the monthday= slot so the new date= stays in the same position.
       dateparam = t.get("monthday")
       dateparam.name = "date"
       # A purely numeric "N/N" monthday is joined with a slash, anything
       # else with a space.
       if re.search("^[0-9]+/[0-9]+$", monthday):
         fmt = "%s/%s"
       else:
         fmt = "%s %s"
       dateparam.value = fmt % (monthday, year)
       rmparam(t, "year")
       pagemsg("monthday/year -> date")

   renames = (
     ("group", "newsgroup"),
     ("text", "passage"),
     ("6", "passage"),
     ("5", "url"),
     ("4", "newsgroup"),
     ("3", "title"),
     ("2", "author"),
     ("1", "date"),
   )
   for old, new in renames:
     move_param(t, old, new)

   return before != unicode(t)
Esempio n. 5
0
def process_page(templates, index, page, save=False, verbose=False):
  """Shift the plural-stem argument of the given templates one slot to the left.

  For every template on the page whose name is in `templates`, the value in
  positional argument 5 (when arg1_is_stress() accepts argument 1) or 4
  (otherwise) is moved to argument 4 or 3 respectively. Templates with an
  unkeyed argument equal to "or", "-" or "_", or starting with "join:", are
  skipped with a warning, as are templates where the destination slot is
  already filled.

  index and the page title only prefix log messages; `verbose` is unused.
  The page is saved only when `save` is true and something was moved.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def cannot_handle(t):
    # Warn and return True for templates this script punts on.
    for param in t.params:
      if param.showkey:
        continue
      val = unicode(param.value)
      if val == "or":
        pagemsg("WARNING: Can't handle multi-decl templates: %s" % unicode(t))
        return True
      if val in ("-", "_") or val.startswith("join:"):
        pagemsg("WARNING: Can't handle multi-word templates: %s" % unicode(t))
        return True
    return False

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist")
    return

  parsed = blib.parse(page)
  changed = False

  for t in parsed.filter_templates():
    if unicode(t.name) not in templates:
      continue
    origt = unicode(t)
    if cannot_handle(t):
      continue

    if arg1_is_stress(getparam(t, "1")):
      oldplarg, newplarg = "5", "4"
    else:
      oldplarg, newplarg = "4", "3"

    plstem = getparam(t, oldplarg)
    if not plstem:
      continue
    if getparam(t, newplarg):
      pagemsg("WARNING: Something wrong, found args in both positions %s and %s: %s" %
          (newplarg, oldplarg, unicode(t)))
      continue

    rmparam(t, oldplarg)
    t.add(newplarg, plstem)
    changed = True
    pagemsg("Replacing %s with %s" % (origt, unicode(t)))

  if not changed:
    return

  comment = "Move plstem from 5th/4th argument to 4th/3rd"
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = unicode(parsed)
  page.save(comment=comment)
Esempio n. 6
0
def process_page(index, page, save, verbose):
  """Eliminate past_m= from {{ru-conj}} templates whose 2= starts with 8a/8b.

  past_m= is always removed when present. Its value is then
    * dropped, if it equals argument 3;
    * written to argument 3, if 2= starts with "8b" but not "8b/" and
      ru.make_unstressed(past_m) equals argument 3;
    * written to argument 5 otherwise.
  Templates having any parameter whose value is "or" are skipped.

  index and the page title only prefix log messages. With `verbose`, old and
  new page text are logged. The page is saved only when `save` is true.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []

  for t in parsed.filter_templates():
    origt = unicode(t)
    conjtype = getparam(t, "2")
    if unicode(t.name) == "ru-conj" and re.search(r"^8[ab]", conjtype):
      if any(unicode(p.value) == "or" for p in t.params):
        pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
        continue
      past_m = getparam(t, "past_m")
      if past_m:
        rmparam(t, "past_m")
        stem = getparam(t, "3")
        plain_8b = conjtype.startswith("8b") and not conjtype.startswith("8b/")
        if stem == past_m:
          pagemsg("Stem %s and past_m same" % stem)
          notes.append("remove redundant past_m %s" % past_m)
        elif plain_8b and ru.make_unstressed(past_m) == stem:
          pagemsg("Class 8b/b and stem %s is unstressed version of past_m %s, replacing stem with past_m" % (
            stem, past_m))
          t.add("3", past_m)
          notes.append("moving past_m %s to arg 3" % past_m)
        else:
          pagemsg("Stem %s and past_m %s are different, putting past_m in param 5" % (
            stem, past_m))
          t.add("5", past_m)
          notes.append("moving past_m %s to arg 5" % past_m)
    newt = unicode(t)
    if newt != origt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
def process_page(index, page, save, verbose):
  """Disabled script: log a warning and return without touching the page.

  Everything after the early return is unreachable. It is the former
  implementation, which rewrote the 1= argument of {{ru-conj}} templates
  from 6a/6c (when no_iotation= was set) or 6b into the corresponding
  6°a/6°c/6°b form.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("WARNING: Script no longer applies and would need fixing up")
  return

  # ---- unreachable legacy implementation below ----
  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []

  for t in parsed.filter_templates():
    origt = unicode(t)
    conjtype = getparam(t, "1")
    if unicode(t.name) == "ru-conj":
      if re.search(r"^6[ac]", conjtype):
        if getparam(t, "no_iotation"):
          rmparam(t, "no_iotation")
          if conjtype.startswith("6a"):
            note = u"6a + no_iotation -> 6°a"
          else:
            note = u"6c + no_iotation -> 6°c"
          notes.append(note)
          t.add("1", re.sub("^6", u"6°", conjtype))
      elif re.search(r"^6b", conjtype):
        notes.append(u"6b -> 6°b")
        t.add("1", re.sub("^6", u"6°", conjtype))
    newt = unicode(t)
    if newt != origt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
Esempio n. 8
0
def process_page(index, page, save, verbose):
  """Convert head= to 1= in {{ru-phrase}} templates on a page.

  A template that has head= but no 1= gets head= removed and its value stored
  as 1=; a non-empty tr= is removed and re-added after 1=. Templates with
  both head= and 1= are reported and left alone. Any template with tr= also
  triggers a warning.

  index and the page title only prefix log messages. With `verbose`, old and
  new page text are logged. The page is saved only when `save` is true.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []

  for t in parsed.filter_templates():
    if unicode(t.name) != "ru-phrase":
      continue
    if t.has("tr"):
      pagemsg("WARNING: Has tr=: %s" % unicode(t))
    if not t.has("head"):
      continue
    if t.has("1"):
      pagemsg("WARNING: Has both head= and 1=: %s" % unicode(t))
      continue

    notes.append("ru-phrase: convert head= to 1=")
    origt = unicode(t)
    head = getparam(t, "head")
    tr = getparam(t, "tr")
    rmparam(t, "head")
    rmparam(t, "tr")
    t.add("1", head)
    if tr:
      t.add("tr", tr)
    pagemsg("Replacing %s with %s" % (origt, unicode(t)))

  new_text = unicode(parsed)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
def process_page(index, page, save, verbose):
  """Strip g=, g2=, g3=, g4= from {{head|ru|adjective form}} in the Russian section.

  The page text is split on level-2 headers and only the section following
  "==Russian==" is parsed. Pages with a colon in the title, or with more than
  one Russian section, are skipped with a warning.

  index and the page title only prefix log messages. With `verbose`, old and
  new page text are logged. The page is saved only when `save` is true.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)  # not read anywhere below

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []

  # The capturing group keeps the headers: odd indices are headers, even
  # indices are the section bodies that follow them.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
  foundrussian = False

  for j in xrange(2, len(sections), 2):
    if sections[j-1] != "==Russian==\n":
      continue
    if foundrussian:
      pagemsg("WARNING: Found multiple Russian sections, skipping page")
      return
    foundrussian = True

    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      is_adj_form = (unicode(t.name) == "head" and getparam(t, "1") == "ru"
          and getparam(t, "2") == "adjective form")
      if not is_adj_form:
        continue
      origt = unicode(t)
      for gparam in ("g", "g2", "g3", "g4"):
        rmparam(t, gparam)
      newt = unicode(t)
      if newt != origt:
        pagemsg("Replaced %s with %s" % (origt, newt))
        notes.append("remove gender from adjective forms")
    sections[j] = unicode(parsed)

  new_text = "".join(sections)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  assert notes
  comment = "; ".join(blib.group_notes(notes))
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
Esempio n. 10
0
def process_page(index, page, save, verbose, genders):
  """Replace the gender parameters of the page's single noun headword template.

  The page must contain exactly one {{ru-noun+}} or {{ru-proper noun+}}
  template; otherwise a warning is logged and nothing is done. g=, g2= ...
  g5= are removed from it and `genders` (a sequence of gender strings) is
  written back as g=, g2=, g3=, ... in order.

  If the template text ends up identical to what it was, the page is left
  alone instead of logging a spurious replacement and issuing a no-op save.

  index and the page title only prefix log messages; `verbose` is unused.
  The page is saved only when `save` is true.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  parsed = blib.parse(page)

  headword_template = None
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      if headword_template:
        pagemsg("WARNING: Multiple headword templates, skipping")
        return
      headword_template = t
  if not headword_template:
    pagemsg("WARNING: No headword templates, skipping")
    return

  orig_template = unicode(headword_template)
  for param in ("g", "g2", "g3", "g4", "g5"):
    rmparam(headword_template, param)
  for gnum, g in enumerate(genders, 1):
    # The first gender goes in g=, later ones in g2=, g3=, ...
    param = "g" if gnum == 1 else "g%s" % gnum
    headword_template.add(param, g)

  new_template = unicode(headword_template)
  if new_template == orig_template:
    pagemsg("Headword template already has the requested genders, skipping: %s"
        % orig_template)
    return
  pagemsg("Replacing %s with %s" % (orig_template, new_template))

  comment = "Fix headword gender, substituting new value %s" % ",".join(genders)
  if save:
    pagemsg("Saving with comment = %s" % comment)
    page.text = unicode(parsed)
    page.save(comment=comment)
  else:
    pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose):
  """Fold suffix=ся into the lemma of adjective declension templates.

  Only pages whose title ends in "ся" are examined. For {{ru-decl-adj}} and
  {{ru-adj-old}} templates with suffix=ся, "ся" is inserted before every
  comma in argument 1 and appended at its end, and suffix= is removed.

  index and the page title only prefix log messages. With `verbose`, old and
  new page text are logged. The page is saved only when `save` is true.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if not pagetitle.endswith(u"ся"):
    return

  text = unicode(page.text)
  notes = []
  parsed = blib.parse(page)

  for t in parsed.filter_templates():
    wanted = (unicode(t.name) in ["ru-decl-adj", "ru-adj-old"]
        and getparam(t, "suffix") == u"ся")
    if not wanted:
      continue
    origt = unicode(t)
    with_commas = re.sub(",", u"ся,", getparam(t, "1"))
    t.add("1", re.sub("$", u"ся", with_commas))
    rmparam(t, "suffix")
    notes.append(u"move suffix=ся to lemma")
    newt = unicode(t)
    if newt != origt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
Esempio n. 12
0
def process_page(index, page, save, verbose):
  """Disabled script: log a warning and return without touching the page.

  Everything after the early return is unreachable. It is the former
  implementation, which appended a "щ" found in argument 4 of
  {{ru-conj-4a}} to argument 3 and removed argument 4.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("WARNING: Script no longer applies and would need fixing up")
  return

  # ---- unreachable legacy implementation below ----
  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []

  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) == "ru-conj-4a":
      shch = getparam(t, "4")
      if shch == u"щ":
        t.add("3", getparam(t, "3") + shch)
        rmparam(t, "4")
        notes.append(u"move param 4 (щ) to param 3")
      elif shch:
        pagemsg("WARNING: Strange value %s for param 4" % shch)
    newt = unicode(t)
    if newt != origt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
Esempio n. 13
0
  def move_param(t, fr, to, frob_from=None):
    """Rename parameter `fr` of template `t` to `to`, optionally converting its value.

    A blank `fr` is simply removed. If `frob_from` is given it is called with
    the old value; a falsy or blank result cancels the move. Nothing is moved
    (a warning is logged) when `to` already holds a non-blank value.
    """
    if not t.has(fr):
      return

    oldval = getparam(t, fr)
    if not oldval.strip():
      rmparam(t, fr)
      pagemsg("Removing blank param %s" % fr)
      return

    newval = oldval
    if frob_from:
      newval = frob_from(oldval)
      if not newval or not newval.strip():
        return

    if getparam(t, to).strip():
      pagemsg("WARNING: Would replace %s= -> %s= but %s= is already present: %s"
          % (fr, to, to, unicode(t)))
      return

    value_changed = oldval != newval
    rmparam(t, to) # in case of blank param

    # If either old or new name is a number, use remove/add to automatically
    # set the showkey value properly; else it's safe to just change the name
    # of the param, which will preserve its location.
    if re.search("^[0-9]+$", fr) or re.search("^[0-9]+$", to):
      rmparam(t, fr)
      t.add(to, newval)
    else:
      moved = t.get(fr)
      moved.name = to
      if value_changed:
        moved.value = newval

    if value_changed:
      pagemsg("%s=%s -> %s=%s" % (fr, oldval.replace("\n", r"\n"), to,
        newval.replace("\n", r"\n")))
    else:
      pagemsg("%s -> %s" % (fr, to))
Esempio n. 14
0
def process_page(index, page, save, verbose):
  """Remove adj= and shto= from every {{ru-ux}} template on a page.

  index and the page title only prefix log messages. With `verbose`, old and
  new page text are logged. The page is saved only when `save` is true.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  # (parameter, log message, edit-summary note) for each parameter to drop.
  obsolete = (
    ("adj", "Removing adj=", "remove adj= from ru-ux"),
    ("shto", "Removing shto=", "remove shto= from ru-ux"),
  )

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []

  for t in parsed.filter_templates():
    if unicode(t.name) != "ru-ux":
      continue
    origt = unicode(t)
    for param, logtext, note in obsolete:
      if t.has(param):
        pagemsg(logtext)
        notes.append(note)
        rmparam(t, param)
    newt = unicode(t)
    if newt != origt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
Esempio n. 15
0
def process_page(index, page, save, verbose):
  """Disabled script: log a warning and return without touching the page.

  Everything after the early return is unreachable. It is the former
  implementation, which replaced a non-empty argument 4 of {{ru-conj-5c}} and
  {{ru-conj-6b}} with past_f= at the same position.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("WARNING: Script no longer applies and would need fixing up")
  return

  # ---- unreachable legacy implementation below ----
  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []

  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) in ["ru-conj-5c", "ru-conj-6b"]:
      past_f = getparam(t, "4")
      if past_f:
        # Insert past_f= ahead of 4= first so it takes over that position.
        t.add("past_f", past_f, before="4")
        rmparam(t, "4")
        notes.append("Replace 4= with past_f=")
    newt = unicode(t)
    if newt != origt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
def process_page(index, page, save, verbose):
  """Disabled script: log a warning and return without touching the page.

  Everything after the early return is unreachable. It is the former
  implementation, which for {{ru-conj}} moved argument 6 to argument 4 when
  1= starts with "6a", and argument 7 to argument 6 when 1= starts with "7b",
  filling any missing lower-numbered arguments with empty values.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("WARNING: Script no longer applies and would need fixing up")
  return

  # ---- unreachable legacy implementation below ----
  def ensure_positional(t, last):
    # Add empty values for any of the arguments 1..last that are missing.
    for i in xrange(1, last + 1):
      key = str(i)
      if not t.has(key):
        t.add(key, "")

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []

  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) == "ru-conj":
      conjtype = getparam(t, "1")
      if conjtype.startswith("6a"):
        param6 = getparam(t, "6")
        if param6:
          rmparam(t, "6")
          if not getparam(t, "5"):
            rmparam(t, "5")
          ensure_positional(t, 3)
          t.add("4", param6)
          notes.append("move type 6a arg6 -> arg4")
      if conjtype.startswith("7b"):
        param7 = getparam(t, "7")
        if param7:
          rmparam(t, "7")
          ensure_positional(t, 5)
          t.add("6", param7)
          notes.append("move type 7b arg7 -> arg6")
    newt = unicode(t)
    if newt != origt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
def process_text_on_page(index, pagename, text, verbs):
    """Convert {{head|en|verb}} headwords in `text` into {{en-verb}}.

    Behaviour depends on the module-level ``args.mode``:

    * ``"full-conj"``: `verbs` is looked up by the whole `pagename`, and the
      value found is stored as 1= of the new {{en-verb}}.
    * any other mode: `verbs` is looked up by the first space-separated word
      of `pagename`; 1= and head= are then derived from that entry and from
      the remaining words of the page name.

    In both modes a template that already has a non-empty 3= is reported and
    left untouched, and 2= is removed from every converted template.

    Args:
        index: page index, used only to prefix log messages.
        pagename: page title, used for log messages and for lookups in `verbs`.
        text: wikitext of the page.
        verbs: mapping from page name (or first word) to the value to put in
            or combine into 1=.

    Returns:
        ``(new_text, notes)`` on success, or None when no entry is found in
        `verbs`.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagename, txt))

    # Defined for parity with pagemsg() but not called anywhere in this block.
    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagename, txt))

    pagemsg("Processing")

    notes = []

    if args.mode == "full-conj":
        # Whole-page lookup: the entry in `verbs` becomes 1= verbatim.
        if pagename not in verbs:
            pagemsg("WARNING: Couldn't find entry for pagename")
            return

        parsed = blib.parse_text(text)
        for t in parsed.filter_templates():
            tn = tname(t)
            origt = unicode(t)
            if tn == "head" and getparam(t, "1") == "en" and getparam(
                    t, "2") == "verb":
                if getparam(t, "3"):
                    pagemsg("WARNING: Already has 3=, not touching: %s" %
                            unicode(t))
                    continue
                blib.set_template_name(t, "en-verb")
                t.add("1", verbs[pagename])
                rmparam(t, "2")
                notes.append(
                    "convert {{head|en|verb}} of multiword expression to {{en-verb}}"
                )
            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    else:
        # NOTE(review): this unpacking raises ValueError when pagename has no
        # space; callers presumably only pass multiword titles — confirm.
        first, rest = pagename.split(" ", 1)
        if first not in verbs:
            pagemsg("WARNING: Couldn't find entry for first=%s" % first)
            return

        parsed = blib.parse_text(text)
        for t in parsed.filter_templates():
            tn = tname(t)
            origt = unicode(t)
            if tn == "head" and getparam(t, "1") == "en" and getparam(
                    t, "2") == "verb":
                if getparam(t, "3"):
                    pagemsg("WARNING: Already has 3=, not touching: %s" %
                            unicode(t))
                    continue
                blib.set_template_name(t, "en-verb")
                # `done` records that 1=/head= have been fully handled by the
                # "plural" branch; when it stays False the generic fallback
                # further down runs instead.
                done = False
                words = pagename.split(" ")
                # True as soon as singularizable() accepts any word of the
                # page name (presumably a plural noun — confirm against the
                # helper's definition).
                plural = False
                for word in words:
                    if singularizable(word):
                        plural = True
                        break
                if plural:
                    if verbs[first].startswith("<"):
                        # The entry starts with "<": attach it directly to
                        # the linked first word and link the other words via
                        # link().
                        restwords = []
                        for word in words[1:]:
                            restwords.append(link(word))
                        param1 = "[[%s]]%s %s" % (first, verbs[first],
                                                  " ".join(restwords))
                        # The head implied by param1 is param1 with every
                        # <...> chunk stripped out.
                        head_from_param = re.sub("<.*?>", "", param1)
                        existing_head = getparam(t, "head")
                        canon_existing_head = canonicalize_existing_linked_head(
                            existing_head, pagemsg)
                        if canon_existing_head == head_from_param:
                            # head= is redundant with what 1= will generate.
                            pagemsg("Removing existing head %s" %
                                    existing_head)
                            rmparam(t, "head")
                            t.add("1", param1)
                            done = True
                        elif canon_existing_head != existing_head:
                            # Keep the (canonicalized) head; `done` stays
                            # False so the fallback below sets 1=.
                            pagemsg(
                                "Replacing existing head %s with canonicalized %s"
                                % (existing_head, canon_existing_head))
                            t.add("head", canon_existing_head)
                            pagemsg(
                                "WARNING: Existing head not removed (canonicalized to %s, different from head-from-param %s): %s"
                                %
                                (canon_existing_head, head_from_param, origt))
                        elif existing_head:
                            pagemsg(
                                "WARNING: Existing head not removed (different from head-from-param %s): %s"
                                % (head_from_param, origt))
                        else:
                            t.add("1", param1)
                            done = True
                    else:
                        # The entry does not start with "<": it goes into 1=
                        # as is, and the linking is expressed through head=.
                        t.add("1", verbs[first])
                        headwords = []
                        for word in words:
                            if not headwords:  # first word
                                headwords.append("[[" + word + "]]")
                            else:
                                headwords.append(link(word))
                        head_from_param = " ".join(headwords)
                        existing_head = getparam(t, "head")
                        canon_existing_head = canonicalize_existing_linked_head(
                            existing_head, pagemsg)
                        if canon_existing_head == head_from_param:
                            pagemsg("Removing existing head %s" %
                                    existing_head)
                            rmparam(t, "head")
                        elif canon_existing_head != existing_head:
                            pagemsg(
                                "Replacing existing head %s with canonicalized %s"
                                % (existing_head, canon_existing_head))
                            t.add("head", canon_existing_head)
                            pagemsg(
                                "WARNING: Existing head not removed (canonicalized to %s, different from head-from-param %s): %s"
                                %
                                (canon_existing_head, head_from_param, origt))
                        elif existing_head:
                            pagemsg(
                                "WARNING: Existing head not removed (different from head-from-param %s): %s"
                                % (head_from_param, origt))
                        else:
                            t.add("head", head_from_param)
                        done = True
                if not done:
                    # Generic fallback: compare any existing head= against
                    # the page name with every word except "the" wrapped in
                    # [[...]].
                    existing_head = getparam(t, "head")
                    if existing_head:
                        head_from_param = " ".join(
                            "[[%s]]" % word if word != "the" else word
                            for word in pagename.split(" "))
                        canon_existing_head = canonicalize_existing_linked_head(
                            existing_head, pagemsg)
                        if canon_existing_head == head_from_param:
                            pagemsg("Removing existing head %s" %
                                    existing_head)
                            rmparam(t, "head")
                        elif canon_existing_head != existing_head:
                            pagemsg(
                                "Replacing existing head %s with canonicalized %s"
                                % (existing_head, canon_existing_head))
                            t.add("head", canon_existing_head)
                            pagemsg(
                                "WARNING: Existing head not removed (canonicalized to %s, different from head-from-param %s): %s"
                                %
                                (canon_existing_head, head_from_param, origt))
                        else:
                            pagemsg(
                                "WARNING: Existing head not removed (different from head-from-param %s): %s"
                                % (head_from_param, origt))
                    # 1= is either "<first><entry> <rest>" (entry starts with
                    # "<") or the entry on its own.
                    if verbs[first].startswith("<"):
                        t.add("1", "%s%s %s" % (first, verbs[first], rest))
                    else:
                        t.add("1", verbs[first])
                rmparam(t, "2")

                notes.append(
                    "convert {{head|en|verb}} of multiword expression to {{en-verb}}"
                )
            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    return unicode(parsed), notes
Esempio n. 18
0
def process_page(index, page, direc, delete_bad, verbose):
  """Disabled script: log a warning and return without touching the page.

  Everything after the early return is unreachable. The retained code is the
  former implementation: for each {{ru-conj}} whose 1= is exactly "3olda" it
  removed arguments 4-6, set 1= to `direc`, and (with `delete_bad`) deleted
  form pages for past-tense forms that the old template generated but the
  new one does not. It ends by returning ``(new_text, notes)``.

  NOTE(review): the unreachable code reads `save` and `site`, neither of
  which is a parameter or local of this function — presumably module-level
  globals; confirm before re-enabling.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("WARNING: Script no longer applies and would need fixing up")
  return

  # ---- unreachable legacy implementation below ----
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  # Accept the ASCII spelling "3oa" on input and store it with the degree sign.
  direc = direc.replace("3oa", u"3°a")
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) in ["ru-conj"]:
      conjtype = getparam(t, "1")
      if not conjtype.startswith("3olda"):
        continue
      if conjtype.startswith("3olda") and conjtype != "3olda":
        pagemsg("WARNING: Found 3a-old with variant, can't process: %s" % unicode(t))
        continue
      # Expand the template as {{ru-generate-verb-forms}} before the change
      # to capture the forms it currently produces.
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      oldargs = blib.split_generate_args(result)
      rmparam(t, "6")
      rmparam(t, "5")
      rmparam(t, "4")
      t.add("1", direc)
      # Expand again after the change to get the new set of forms.
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      if delete_bad:
        newargs = blib.split_generate_args(result)
        for form in ["past_m", "past_f", "past_n", "past_pl", "past_m_short",
            "past_f_short", "past_n_short", "past_pl_short"]:
          oldforms = re.split(",", oldargs[form]) if form in oldargs else []
          newforms = re.split(",", newargs[form]) if form in newargs else []
          for oldform in oldforms:
            # Only forms that disappeared with the new conjugation type are
            # candidates for deletion.
            if oldform not in newforms:
              formpagename = rulib.remove_accents(oldform)
              formpage = pywikibot.Page(site, formpagename)
              if not formpage.exists():
                pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename)
              elif formpagename == pagetitle:
                pagemsg("WARNING: Attempt to delete dictionary form, skipping")
              else:
                # NOTE(review): this rebinds `text`, which held the main
                # page's text; `text` is not read again later in this block.
                text = unicode(formpage.text)
                # Delete only when the form page has no "Etymology 1", no
                # "----" and exactly one {{inflection of}} template.
                if "Etymology 1" in text:
                  pagemsg("WARNING: Found 'Etymology 1', skipping form %s" % formpagename)
                elif "----" in text:
                  pagemsg("WARNING: Multiple languages apparently in form, skippin form %s" % formpagename)
                else:
                  numinfls = len(re.findall(r"\{\{inflection of\|", text))
                  if numinfls < 1:
                    pagemsg("WARNING: Something wrong, no 'inflection of' templates on page for form %s" % formpagename)
                  elif numinfls > 1:
                    pagemsg("WARNING: Multiple 'inflection of' templates on page for form %s, skipping" % formpagename)
                  else:
                    comment = "Delete erroneously created long form of %s" % pagetitle
                    pagemsg("Existing text for form %s: [[%s]]" % (
                      formpagename, text))
                    # NOTE(review): `save` is not defined in this function.
                    if save:
                      formpage.delete(comment)
                    else:
                      pagemsg("Would delete page %s with comment=%s" %
                          (formpagename, comment))

      notes.append("fix 3olda -> %s" % direc)
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  return unicode(parsed), notes
Esempio n. 19
0
def rewrite_one_page_arz_headword(page, index, text):
    """Convert {{arz-noun}} and {{arz-adj}} templates to new-style parameters.

    For every matching template in `text` (a parsed page exposing
    filter_templates()), the old parameters are read and removed, then
    re-added under their new names: the headword becomes 1=, and for nouns
    the gender becomes 2= and the old 2=/3= become pl=/pltr=.  Optional
    values are only re-added when non-empty.

    `page` and `index` are not used here.

    Returns a tuple of (`text`, change comment listing the rewritten
    template names).
    """
    def pop_param(template, name):
        # Fetch a parameter's value, then drop the parameter.
        value = getparam(template, name)
        rmparam(template, name)
        return value

    def add_nonempty(template, pairs):
        # Re-add, in the given order, only the parameters that carry a value.
        for name, value in pairs:
            if value:
                addparam(template, name, value)

    rewritten = []
    for template in text.filter_templates():
        template_name = unicode(template.name)
        if template_name == "arz-noun":
            headword = pop_param(template, "head")
            translit = pop_param(template, "tr")
            sortkey = pop_param(template, "sort")
            gender = pop_param(template, "g")
            gender2 = pop_param(template, "g2")
            plural = pop_param(template, "2")
            plural_translit = pop_param(template, "3")
            # Headword and gender are always written, even when empty.
            addparam(template, "1", headword)
            addparam(template, "2", gender)
            add_nonempty(template, [
                ("g2", gender2),
                ("tr", translit),
                ("pl", plural),
                ("pltr", plural_translit),
                ("sort", sortkey),
            ])
            rewritten.append("arz-noun")
        elif template_name == "arz-adj":
            headword = pop_param(template, "head")
            translit = pop_param(template, "tr")
            sortkey = pop_param(template, "sort")
            # pwv=/fwv= take precedence over p=/f= when both are given.
            plural_pwv = pop_param(template, "pwv")
            plural_p = pop_param(template, "p")
            plural = plural_pwv or plural_p
            plural_translit = pop_param(template, "ptr")
            feminine_fwv = pop_param(template, "fwv")
            feminine_f = pop_param(template, "f")
            feminine = feminine_fwv or feminine_f
            feminine_translit = pop_param(template, "ftr")
            addparam(template, "1", headword)
            add_nonempty(template, [
                ("tr", translit),
                ("f", feminine),
                ("ftr", feminine_translit),
                ("pl", plural),
                ("pltr", plural_translit),
                ("sort", sortkey),
            ])
            rewritten.append("arz-adj")
    return text, "rewrite %s to new style" % ", ".join(rewritten)
Esempio n. 20
0
def process_page(page, index, do_fix):
    """Check past passive participles of {{ru-conj}} / {{ru-conj-old}} templates.

    Each conjugation template on the page is expanded through
    {{ru-generate-verb-forms}}; the generated "past_pasv_part" and "ppp"
    forms are checked against several heuristics (ending, first letter,
    ending expected from the infinitive, and the form predicted by
    form_ppp()).  Suspicious forms are logged and, when `do_fix` is true,
    removed from the template's parameter chain.

    page   -- page object providing title() and text.
    index  -- page index, used only in log messages.
    do_fix -- when true, rewrite the template to drop the flagged forms.

    NOTE(review): no return statement is visible; `notes` is built but not
    returned here -- the function may be truncated, confirm against the
    full script.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        # NOTE(review): `verbose` is neither a parameter nor a local of this
        # function; presumably a module-level global -- confirm.
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    pagemsg("Processing")

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        tname = unicode(t.name)
        if tname in ["ru-conj", "ru-conj-old"]:
            # Templates with a literal "or" argument are not handled.
            if [x for x in t.params if unicode(x.value) == "or"]:
                pagemsg("WARNING: Skipping multi-arg conjugation: %s" %
                        unicode(t))
                continue
            conjtype = getparam(t, "2")
            # Turn the conjugation call into the equivalent form-generating
            # call by swapping the template name.
            if tname == "ru-conj":
                tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms",
                                  unicode(t))
            else:
                tempcall = re.sub(r"\{\{ru-conj-old",
                                  "{{ru-generate-verb-forms|old=y", unicode(t))
            result = expand_text(tempcall)
            if not result:
                pagemsg("WARNING: Error generating forms, skipping")
                continue
            args = blib.split_generate_args(result)
            for base in ["past_pasv_part", "ppp"]:
                forms_to_remove = []
                # "-" marks an explicitly absent participle.
                if args[base] == "-":
                    continue
                for form in re.split(",", args[base]):
                    origform = form
                    # Drop everything from "//" onward before checking.
                    form = re.sub("//.*", "", form)
                    fix_form = False
                    if not re.search(ur"([аяеё]́?нный|тый)$", form):
                        pagemsg(
                            "WARNING: Past passive participle doesn't end correctly: %s"
                            % form)
                        fix_form = True
                    unstressed_page = rulib.make_unstressed_ru(pagetitle)
                    unstressed_form = rulib.make_unstressed_ru(form)
                    # `warned` records the "wrong aspect" heuristics below; when
                    # either fires, the rule-based check is skipped.
                    warned = False
                    if unstressed_form[0] != unstressed_page[0]:
                        pagemsg(
                            "WARNING: Past passive participle doesn't begin with same letter, probably for wrong aspect: %s"
                            % form)
                        warned = True
                        fix_form = True
                    if form.endswith(u"нный"):
                        # Expected ending is chosen from the infinitive ending
                        # of the page title.
                        if pagetitle.endswith(u"ать"):
                            good_ending = u"анный"
                        elif pagetitle.endswith(u"ять"):
                            good_ending = u"янный"
                        else:
                            good_ending = u"енный"
                        if not unstressed_form.endswith(good_ending):
                            pagemsg(
                                "WARNING: Past passive participle doesn't end right, probably for wrong aspect: %s"
                                % form)
                            warned = True
                            fix_form = True
                    if not warned:
                        correct_form = form_ppp(conjtype, pagetitle, args)
                        if correct_form and unstressed_form != correct_form:
                            pagemsg(
                                "WARNING: Past passive participle not formed according to rule, probably wrong: found %s, expected %s"
                                % (unstressed_form, correct_form))
                            fix_form = True
                    if fix_form:
                        forms_to_remove.append(origform)
                if forms_to_remove and do_fix:
                    # Collect the current chain base, base2, ..., base9.
                    curvals = []
                    for i in ["", "2", "3", "4", "5", "6", "7", "8", "9"]:
                        val = getparam(t, base + i)
                        if val:
                            curvals.append(val)
                    newvals = [x for x in curvals if x not in forms_to_remove]
                    # Sanity check: every flagged form should have matched a
                    # template parameter value.
                    if len(curvals) - len(newvals) != len(forms_to_remove):
                        pagemsg(
                            "WARNING: Something wrong, couldn't remove all PPP forms %s"
                            % ",".join(forms_to_remove))
                    curindex = 1
                    origt = unicode(t)
                    # Rewrite the surviving values compactly from the start of
                    # the chain, then drop the now-unused trailing slots.
                    for newval in newvals:
                        t.add(base + ("" if curindex == 1 else str(curindex)),
                              newval)
                        curindex += 1
                    for i in xrange(curindex, 10):
                        rmparam(t, base + ("" if i == 1 else str(i)))
                    pagemsg("Replacing %s with %s" % (origt, unicode(t)))
                    notes.append("removed bad past pasv part(s) %s" %
                                 ",".join(forms_to_remove))
Esempio n. 21
0
def process_page(page, index, parsed):
    """Replace {{ru-decl-noun-see}} with a {{ru-noun-table}} built from the headword.

    The page must contain exactly one headword template ({{ru-noun+}} or
    {{ru-proper noun+}}) and exactly one {{ru-decl-noun-see}}; otherwise a
    warning is logged and None is returned.  The see-template receives a
    copy of the headword's parameters and is renamed to ru-noun-table.  For
    proper nouns the n= parameter is adjusted so the resulting table keeps
    the number the headword actually had.

    The incoming `parsed` argument is ignored; the page is re-parsed here.

    Returns (new page text, change comment), or None when skipped or when a
    template expansion fails.
    """
    global args
    verbose = args.verbose
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    parsed = blib.parse(page)

    # Locate the single headword template and the single see-template.
    headword = None
    decl_see = None
    for template in parsed.filter_templates():
        template_name = unicode(template.name)
        if template_name in ["ru-noun+", "ru-proper noun+"]:
            if headword:
                pagemsg("WARNING: Multiple headword templates, skipping")
                return
            headword = template
        elif template_name == "ru-decl-noun-see":
            if decl_see:
                pagemsg(
                    "WARNING: Multiple ru-decl-noun-see templates, skipping")
                return
            decl_see = template
    if not headword:
        pagemsg("WARNING: No ru-noun+ or ru-proper noun+ templates, skipping")
        return
    if not decl_see:
        pagemsg("WARNING: No ru-decl-noun-see templates, skipping")
        return

    # Wipe the see-template and rebuild it from the headword's parameters.
    del decl_see.params[:]
    for headword_param in headword.params:
        decl_see.add(headword_param.name, headword_param.value)
    decl_see.name = "ru-noun-table"

    if unicode(headword.name) == "ru-proper noun+":
        # Proper nouns default to n=sg while ru-noun-table defaults to
        # n=both, so expand the generator template to learn the effective
        # number.  |ndef=sg makes ru-generate-noun-args use the proper-noun
        # default instead of its own.
        proper_call = re.sub(r"^\{\{ru-proper noun\+",
                             "{{ru-generate-noun-args",
                             unicode(headword))
        proper_call = re.sub(r"\}\}$", "|ndef=sg}}", proper_call)
        proper_result = expand_text(proper_call)
        if not proper_result:
            pagemsg("WARNING: Error generating ru-proper noun+ args")
            return None
        number = blib.split_generate_args(proper_result)["n"]
        if number == "s":
            # Singular must always be spelled out in ru-noun-table.
            decl_see.add("n", "sg")
        elif number != "p":
            # Plural ("p") is left alone: both templates default to plural
            # only for a plural lemma, otherwise n=pl is already present.
            # Remaining case: n=both was explicit in the headword.  Drop it,
            # expand the table, and restore it only if the table would not
            # come out as "both" on its own.
            assert number == "b"
            rmparam(decl_see, "n")
            table_call = re.sub(r"^\{\{ru-noun-table",
                                "{{ru-generate-noun-args",
                                unicode(decl_see))
            table_result = expand_text(table_call)
            if not table_result:
                pagemsg("WARNING: Error generating ru-noun-table args")
                return None
            if blib.split_generate_args(table_result)["n"] != "b":
                decl_see.add("n", "both")

    comment = (
        "Replace ru-decl-noun-see with ru-noun-table, taken from headword template (%s)"
        % unicode(headword.name))
    return unicode(parsed), comment
Esempio n. 22
0
def process_page(index, page, direc, save, verbose):
  """Disabled script: log a warning for the page and return.

  The function returns unconditionally right after the warning below, so
  everything after that `return` is unreachable and kept only as reference.
  The unreachable body rewrites {{ru-conj-7b}} templates according to the
  directive string `direc` and optionally saves the page.

  index   -- page index, used only in log messages.
  page    -- page object providing title(), text and save().
  direc   -- directive string; the dead code looks for "npp", u"ё" and a
             leading "7b/" in it.
  save    -- in the dead code, whether to actually save the page.
  verbose -- in the dead code, whether to log the full old/new text.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("WARNING: Script no longer applies and would need fixing up")
  return

  # ---- Unreachable from here on (see the unconditional return above). ----
  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  origdirec = direc
  for t in parsed.filter_templates():
    origt = unicode(t)
    # direc is consumed/modified per template, so restart from the original.
    direc = origdirec
    if unicode(t.name) in ["ru-conj-7b"]:
      rmparam(t, "past_m")
      rmparam(t, "past_f")
      rmparam(t, "past_n")
      rmparam(t, "past_pl")
      rmparam(t, "notes")
      rmparam(t, "past_adv_part")
      rmparam(t, "past_adv_part2")
      rmparam(t, "past_adv_part_short")
      #ppps = blib.fetch_param_chain(t, "past_pasv_part", "past_pasv_part")
      #blib.remove_param_chain(t, "past_pasv_part", "past_pasv_part")
      presstem = getparam(t, "3")
      rmparam(t, "5")
      rmparam(t, "4")
      rmparam(t, "3")
      # Peel the "npp" and "ё" flags and the "7b/" prefix off the directive.
      npp = "npp" in direc
      direc = direc.replace("npp", "")
      yo = u"ё" in direc
      direc = direc.replace(u"ё", "")
      direc = re.sub("7b/?", "", direc)
      # Stem whose last vowel is е (optionally accented).
      if re.search(u"е́?[^аэыоуяеиёю]*$", presstem):
        if not yo:
          pagemsg(u"Something wrong, е-stem present and no ё directive")
        if npp:
          presstem = ru.make_ending_stressed(presstem)
        else:
          presstem = re.sub(u"е́?([^аэыоуяеиёю]*)$", ur"ё\1", presstem)
      else:
        presstem = ru.make_ending_stressed(presstem)
      pap = getparam(t, "past_actv_part")
      # Predicted past active participle; stems in д/т take -вший unless the
      # directive is "b" or "b(9)".
      pred_pap = presstem + u"ший"
      if direc not in ["b", "b(9)"] and re.search(u"[дт]$", presstem):
        pred_pap = re.sub(u"[дт]$", "", presstem) + u"вший"
      if pap:
        if pap == pred_pap:
          pagemsg("Removing past_actv_part=%s because same as predicted" % pap)
          rmparam(t, "past_actv_part")
        else:
          pagemsg("Not removing unpredictable past_actv_part=%s (predicted %s)" %
              (pap, pred_pap))
      # Report any named parameter other than numbered ones and
      # past_pasv_part*.
      for param in t.params:
        if not re.search("^([0-9]+$|past_pasv_part)", unicode(param.name)):
          pagemsg("Found additional named param %s" % unicode(param))
      t.add("3", presstem)
      if direc:
        t.add("4", "")
        t.add("5", direc)
      blib.sort_params(t)
      #blib.set_param_chain(t, ppps, "past_pasv_part", "past_pasv_part")
      notes.append("set class-7b verb to directive %s%s" %
          (direc, npp and u" (no ё in present stem)" or ""))
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
Esempio n. 23
0
def process_text_on_page(index, pagetitle, text):
    """Convert the old Polish adjective declension templates to {{pl-decl-adj-auto}}.

    Handles {{pl-decl-adj-ki}}, {{pl-decl-adj-y}} / {{pl-adj-y}},
    {{pl-decl-adj-i}} and {{pl-decl-adj-owy}}.  Each is renamed and its 1=
    and 2= parameters are removed; an explicit 1= (stem plus ending) is
    re-added only when the page title contains a colon and differs from that
    form.  For -ki templates the old 2= is preserved as olddat=.

    Returns (new text, list of change notes).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)

        def getp(param):
            return getparam(t, param)

        def become_auto():
            # Rename the template and drop the old positional parameters.
            blib.set_template_name(t, "pl-decl-adj-auto")
            rmparam(t, "2")
            rmparam(t, "1")

        def pin_lemma(stem, suffix):
            # NOTE(review): the explicit 1= is restored only for titles that
            # contain a colon; on other titles a mismatching stem is dropped
            # without a warning -- confirm this is intended.
            if ":" in pagetitle and pagetitle != stem + suffix:
                pagemsg(
                    "WARNING: Param 1=%s doesn't agree with pagetitle: %s" %
                    (stem, origt))
                t.add("1", stem + suffix)

        if tn == "pl-decl-adj-ki":
            stem = getp("1")
            old_dative = getp("2")
            become_auto()
            pin_lemma(stem, "ki")
            if old_dative:
                t.add("olddat", old_dative)
            notes.append("Convert {{pl-decl-adj-ki}} to {{pl-decl-adj-auto}}")
        elif tn in ["pl-decl-adj-y", "pl-adj-y"]:
            if getp("head"):
                pagemsg("WARNING: Saw head=, not changing: %s" % origt)
            else:
                stem = getp("1")
                become_auto()
                pin_lemma(stem, "y")
                notes.append("Convert {{%s}} to {{pl-decl-adj-auto}}" % tn)
        elif tn == "pl-decl-adj-i":
            stem = getp("1")
            variant = getp("2")
            become_auto()
            if stem:
                # 2= selects the -gi / -li variants; anything else means -i.
                if variant in ["g", "gi"]:
                    ending = "gi"
                elif variant in ["l", "li"]:
                    ending = "li"
                else:
                    ending = "i"
                expected_title = stem + ending
                if ":" in pagetitle and pagetitle != expected_title:
                    pagemsg(
                        "WARNING: Param 1=%s doesn't agree with pagetitle (pagetitle should be %s): %s"
                        % (stem, expected_title, origt))
                    t.add("1", expected_title)
            notes.append("Convert {{pl-decl-adj-i}} to {{pl-decl-adj-auto}}")
        elif tn == "pl-decl-adj-owy":
            stem = getp("1")
            become_auto()
            pin_lemma(stem, "owy")
            notes.append("Convert {{pl-decl-adj-owy}} to {{pl-decl-adj-auto}}")

    return unicode(parsed), notes
Esempio n. 24
0
def process_page(index, page, direc):
  """Disabled script: log a warning for the page and return.

  The function returns unconditionally right after the warning below, so
  everything after that `return` is unreachable and kept only as reference.
  The unreachable body rewrites {{ru-conj-7b}} templates according to the
  directive string `direc` and would return (new text, notes).

  index -- page index, used only in log messages.
  page  -- page object providing title() and text.
  direc -- directive string; the dead code looks for "npp", u"ё" and a
           leading "7b/" in it.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("WARNING: Script no longer applies and would need fixing up")
  return

  # ---- Unreachable from here on (see the unconditional return above). ----
  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  origdirec = direc
  for t in parsed.filter_templates():
    origt = unicode(t)
    # direc is consumed/modified per template, so restart from the original.
    direc = origdirec
    if unicode(t.name) in ["ru-conj-7b"]:
      rmparam(t, "past_m")
      rmparam(t, "past_f")
      rmparam(t, "past_n")
      rmparam(t, "past_pl")
      rmparam(t, "notes")
      rmparam(t, "past_adv_part")
      rmparam(t, "past_adv_part2")
      rmparam(t, "past_adv_part_short")
      #ppps = blib.fetch_param_chain(t, "past_pasv_part", "past_pasv_part")
      #blib.remove_param_chain(t, "past_pasv_part", "past_pasv_part")
      presstem = getparam(t, "3")
      rmparam(t, "5")
      rmparam(t, "4")
      rmparam(t, "3")
      # Peel the "npp" and "ё" flags and the "7b/" prefix off the directive.
      npp = "npp" in direc
      direc = direc.replace("npp", "")
      yo = u"ё" in direc
      direc = direc.replace(u"ё", "")
      direc = re.sub("7b/?", "", direc)
      # Stem whose last vowel is е (optionally accented).
      if re.search(u"е́?[^аэыоуяеиёю]*$", presstem):
        if not yo:
          pagemsg(u"Something wrong, е-stem present and no ё directive")
        if npp:
          presstem = rulib.make_ending_stressed_ru(presstem)
        else:
          presstem = re.sub(u"е́?([^аэыоуяеиёю]*)$", ur"ё\1", presstem)
      else:
        presstem = rulib.make_ending_stressed_ru(presstem)
      pap = getparam(t, "past_actv_part")
      # Predicted past active participle; stems in д/т take -вший unless the
      # directive is "b" or "b(9)".
      pred_pap = presstem + u"ший"
      if direc not in ["b", "b(9)"] and re.search(u"[дт]$", presstem):
        pred_pap = re.sub(u"[дт]$", "", presstem) + u"вший"
      if pap:
        if pap == pred_pap:
          pagemsg("Removing past_actv_part=%s because same as predicted" % pap)
          rmparam(t, "past_actv_part")
        else:
          pagemsg("Not removing unpredictable past_actv_part=%s (predicted %s)" %
              (pap, pred_pap))
      # Report any named parameter other than numbered ones and
      # past_pasv_part*.
      for param in t.params:
        if not re.search("^([0-9]+$|past_pasv_part)", unicode(param.name)):
          pagemsg("Found additional named param %s" % unicode(param))
      t.add("3", presstem)
      if direc:
        t.add("4", "")
        t.add("5", direc)
      blib.sort_params(t)
      #blib.set_param_chain(t, ppps, "past_pasv_part", "past_pasv_part")
      notes.append("set class-7b verb to directive %s%s" %
          (direc, npp and u" (no ё in present stem)" or ""))
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  return unicode(parsed), notes
def process_page(page, index, parsed):
  """Convert the older Latin participle templates to {{la-part}}.

  {{la-future participle}}, {{la-perfect participle}} and {{la-gerundive}}
  get the lemma "<param 1>us" (for gerundives, param 2 replaces param 1
  when the two agree apart from macrons).  {{la-present participle}} gets a
  lemma built from param 1 and the ending named by param 2.  A template
  with an empty or unrecognized parameter is logged and left untouched.

  Returns (new page text, list of change notes).
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []

  pagemsg("Processing")

  us_participles = ["la-future participle", "la-perfect participle", "la-gerundive"]
  # Present-participle endings that are simply appended to the stem.
  present_suffixes = {"ans": u"āns", "ens": u"ēns"}

  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn in us_participles:
      second_ok = False
      stem = getparam(t, "1")
      if tn == "la-gerundive":
        alt_stem = getparam(t, "2")
        if alt_stem:
          if lalib.remove_macrons(stem) != lalib.remove_macrons(alt_stem):
            pagemsg("WARNING: Unrecognized param 2: %s" % origt)
            continue
          # Param 2 is the same stem modulo macrons; prefer it.
          second_ok = True
          stem = alt_stem
      if not stem:
        pagemsg("WARNING: Empty param 1: %s" % origt)
        continue
      lemma = stem + "us"
    elif tn == "la-present participle":
      stem = getparam(t, "1")
      suffix = getparam(t, "2")
      if not stem:
        pagemsg("WARNING: Empty param 1: %s" % origt)
        continue
      if not suffix:
        pagemsg("WARNING: Empty param 2: %s" % origt)
        continue
      if suffix in present_suffixes:
        lemma = stem + present_suffixes[suffix]
      elif suffix == "iens":
        lemma = u"%siēns/%seunt" % (stem, stem)
      else:
        pagemsg("WARNING: Unrecognized param 2: %s" % origt)
        continue
      second_ok = True
    else:
      # Not a template this script converts.
      continue

    # Refuse to convert when the template carries parameters beyond the
    # ones understood above; every offender is reported.
    permitted = ("1", "2") if second_ok else ("1",)
    stray = [p for p in t.params if unicode(p.name).strip() not in permitted]
    for p in stray:
      pagemsg("WARNING: Unrecognized param %s=%s: %s" % (
        unicode(p.name), p.value, origt))
    if stray:
      continue

    rmparam(t, "2")
    t.add("1", lemma)
    blib.set_template_name(t, "la-part")
    pagemsg("Replaced %s with %s" % (origt, unicode(t)))
    notes.append(u"convert {{%s}} to {{la-part}}" % tn)

  return unicode(parsed), notes
Esempio n. 26
0
def process_page(page, index, parsed):
    """Rewrite the Brazil-specific place templates as generic {{place}} calls.

    Handles the municipality, state, state-capital and national-capital
    variants (both "place:Brazil/..." and "place:... of Brazil" spellings).
    The old 2= value is carried over as t= when non-empty; state=, region=
    and capital= are translated into the new positional / named parameters.

    Returns (new page text, list of change notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    # Variants that live inside a state and differ only in the place type.
    state_level_types = {
        "place:Brazil/municipality": "municipality",
        "place:municipality of Brazil": "municipality",
        "place:Brazil/state capital": "municipality/state capital",
        "place:state capital of Brazil": "municipality/state capital",
    }

    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        # Old 2= value of a converted template; stays None for others.
        gloss = None
        if tn in state_level_types:
            state = getparam(t, "state")
            gloss = getparam(t, "2")
            blib.set_template_name(t, "place")
            rmparam(t, "state")
            t.add("2", state_level_types[tn])
            t.add("3", "s/%s" % state)
            t.add("4", "c/Brazil")
        elif tn in ["place:Brazil/state", "place:state of Brazil"]:
            region = getparam(t, "region")
            capital = getparam(t, "capital")
            gloss = getparam(t, "2")
            blib.set_template_name(t, "place")
            rmparam(t, "region")
            rmparam(t, "capital")
            t.add("2", "state")
            t.add("3", "r/%s" % region)
            t.add("4", "c/Brazil")
            # capital= is re-added last among the place data, even if empty.
            t.add("capital", capital)
        elif tn in ["place:Brazil/capital", "place:capital of Brazil"]:
            gloss = getparam(t, "2")
            blib.set_template_name(t, "place")
            # Two descriptions separated by ";": national capital, then
            # state capital of Distrito Federal.
            for position, value in [
                    ("2", "municipality/capital city"),
                    ("3", "c/Brazil"),
                    ("4", ";"),
                    ("5", "state capital"),
                    ("6", "s/Distrito Federal"),
                    ("7", "c/Brazil"),
            ]:
                t.add(position, value)
        if gloss:
            t.add("t", gloss)

        newt = unicode(t)
        if origt != newt:
            notes.append("replace {{%s}} with {{place}}" % tn)
            pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes
def process_text_on_page(pagetitle, index, text):
    """Rewrite `lb` adjective-form templates as {{inflection of}}.

    Only the Adjective, Numeral, Ordinal Numeral and Participle sections of
    the page are examined.  For each template named in
    rename_templates_without_lang, or named in rename_templates_with_lang
    with language "lb" (given as lang= or as param 1), the inflectional
    ending is identified by matching lemma + ending against the page title,
    and the template is replaced by {{inflection of|lb|<lemma>||<tags>}}.

    pagetitle -- title of the page (the non-lemma form).
    index     -- page index, used only in log messages.
    text      -- full wikitext of the page.

    Returns (new text, list of change notes).  Returns (None, None) when a
    matched template carries parameters this script does not understand, so
    the whole page is skipped.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    notes = []

    # The capturing group keeps the headers: odd indices hold section
    # headers, even indices hold the text following each header.
    subsections = re.split("(^==+[^=\n]+==+\n)", text, 0, re.M)
    for j in xrange(2, len(subsections), 2):
        if not re.search("==(Adjective|Numeral|Ordinal Numeral|Participle)==",
                         subsections[j - 1]):
            continue
        parsed = blib.parse_text(subsections[j])
        for t in parsed.filter_templates():
            origt = unicode(t)
            tn = tname(t)
            # Work out which parameters hold the language and the lemma.
            if tn in rename_templates_without_lang:
                lemma = getparam(t, "1")
                langparam = None
                lemmaparam = "1"
            elif tn in rename_templates_with_lang and t.has(
                    "lang") and getparam(t, "lang") == "lb":
                lemma = getparam(t, "1")
                langparam = "lang"
                lemmaparam = "1"
            elif tn in rename_templates_with_lang and not t.has(
                    "lang") and getparam(t, "1") == "lb":
                lemma = getparam(t, "2")
                langparam = "1"
                lemmaparam = "2"
            else:
                continue

            lemmas_to_try = [lemma]
            if lemma.endswith("e"):
                # lemma with a schwa
                lemmas_to_try.append(lemma[:-1])
            if lemma == "gutt":
                lemmas_to_try.append("gudd")

            ending_sets_to_try = [positive_ending_tags]

            # Candidate endings: every known ending the page title ends in.
            endings_to_try = []
            for ending_sets in ending_sets_to_try:
                for ending, tag_sets in ending_sets.iteritems():
                    if pagetitle.endswith(ending):
                        endings_to_try.append((ending, tag_sets))
            if len(endings_to_try) == 0:
                pagemsg(
                    "WARNING: Can't identify ending of non-lemma form, skipping"
                )
                continue
            # Keep only the (lemma variant, ending) pairs that reproduce the
            # page title exactly.
            found_combinations = []
            for ending_to_try, tag_sets in endings_to_try:
                for lemma_to_try in lemmas_to_try:
                    if lemma_to_try + ending_to_try == pagetitle:
                        found_combinations.append(
                            (lemma_to_try, ending_to_try, tag_sets))
            if len(found_combinations) == 0:
                pagemsg(
                    "WARNING: Can't match lemma %s with page title (tried lemma variants %s and endings %s), skipping"
                    % (lemma, "/".join(lemmas_to_try), "/".join(
                        ending_to_try
                        for ending_to_try, tag_sets in endings_to_try)))
                continue
            if len(found_combinations) > 1:
                # Report each matching lemma+ending pair (not the full
                # candidate lists).
                pagemsg(
                    "WARNING: Found multiple possible matching endings for lemma %s (found possibilities %s), skipping"
                    % (lemma, "/".join("%s+%s" %
                                       (lemma_to_try, ending_to_try)
                                       for lemma_to_try, ending_to_try,
                                       tag_sets in found_combinations)))
                continue
            lemma_to_try, ending_to_try, tag_sets = found_combinations[0]
            # Erase all params.
            if langparam:
                rmparam(t, langparam)
            elif getparam(t, "lang") == "lb":
                # Sometimes |lang=lb redundantly occurs; remove it if so
                rmparam(t, "lang")
            rmparam(t, lemmaparam)
            tr = getparam(t, "tr")
            rmparam(t, "tr")
            if len(t.params) > 0:
                pagemsg(
                    "WARNING: Original template %s has extra params, skipping"
                    % origt)
                return None, None
            # Set new name
            blib.set_template_name(t, "inflection of")
            # Put back new params.
            t.add("1", "lb")
            t.add("2", lemma)
            if tr:
                t.add("tr", tr)
            t.add("3", "")
            # Tag sets are joined with ";" separators and then spread over
            # consecutive numbered params starting at 4.
            nextparam = 4
            for tag in "|;|".join(tag_sets).split("|"):
                t.add(str(nextparam), tag)
                nextparam += 1
            notes.append("replace %s with %s" % (origt, unicode(t)))
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))
        subsections[j] = unicode(parsed)
    text = "".join(subsections)

    return text, notes
def process_text_on_page(index, pagetitle, text):
    """Audit transliterations on Hindi headword templates against {{hi-ndecl}}.

    For every template named in hindi_head_templates, each tr=/trN= value is
    canonicalized and compared with the automatic transliteration of the
    corresponding head (or the page title).  A lone tr= equal to the
    automatic one is removed; other values are kept (rewritten in canonical
    form if needed) and remembered.  Each following {{hi-ndecl}} has its
    "//" phonetic respellings transliterated and compared with the
    remembered headword transliterations; disagreements are logged as
    warnings.  A noun headword with a non-redundant transliteration but no
    declension template is also reported.

    index     -- page index, used only in log messages.
    pagetitle -- title of the page.
    text      -- full wikitext of the page.

    Returns (new text, list of change notes).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    notes = []

    pagemsg("Processing")

    parsed = blib.parse_text(text)

    # State carried from the most recent headword template to the
    # declension templates that follow it.  head_template_tr stays None
    # until the first headword is seen.
    head_template_tr = None
    head_auto_tr = None
    noun_head_template = None
    saw_ndecl = False
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if tn in hindi_head_templates:
            # A new headword starts: report the previous noun headword if it
            # needed a respelling but never got a declension template.
            if noun_head_template and head_template_tr and not saw_ndecl:
                pagemsg(
                    "WARNING: Missing declension for noun needing phonetic respelling, headtr=%s, autotr=%s: %s"
                    % (",".join(head_template_tr), ",".join(head_auto_tr),
                       unicode(noun_head_template)))
            if tn in ["hi-noun", "hi-proper noun"]:
                noun_head_template = t
            else:
                noun_head_template = None
            saw_ndecl = False
            head_template_tr = []
            head_auto_tr = []
            multi_trs = False
            for i in range(2, 10):
                if getparam(t, "tr%s" % i):
                    multi_trs = True
                    # We might have tr=some special translit and tr2=the default one, and in that case
                    # we don't want to remove tr2= even though it appears redundant.
                    pagemsg(
                        "More than one translit, not removing any redundant ones: %s"
                        % unicode(t))
                    break
            for i in range(1, 10):
                trparam = "tr" if i == 1 else "tr%s" % i
                origtr = getparam(t, trparam)
                tr = canonicalize_tr(origtr)
                if tr:
                    headparam = "head" if i == 1 else "head%s" % i
                    head = getparam(t, headparam)
                    if head:
                        head = blib.remove_links(head)
                    else:
                        head = pagetitle
                    autotr = expand_text("{{xlit|hi|%s}}" % head)
                    if autotr is not None:
                        if autotr == tr and not multi_trs:
                            # Without any trN= only tr= itself can be set.
                            assert i == 1
                            pagemsg(
                                "WARNING: Removing redundant translit tr=%s for head %s"
                                % (tr, head))
                            rmparam(t, "tr")
                            notes.append("remove redundant tr=%s from {{%s}}" %
                                         (tr, tn))
                        else:
                            head_template_tr.append(tr)
                            head_auto_tr.append(autotr)
                            pagemsg(
                                "Page has non-redundant translit %s=%s vs. auto tr=%s in {{%s}}"
                                % (trparam, tr, autotr, tn))
                            if origtr != tr:
                                # Log the canonicalized value (tr), matching
                                # the note recorded below.
                                pagemsg("Canonicalizing %s=%s to %s: %s" %
                                        (trparam, origtr, tr, unicode(t)))
                                t.add(trparam, tr)
                                notes.append(
                                    "canonicalize %s=%s to %s in {{%s}}" %
                                    (trparam, origtr, tr, tn))
            if unicode(t) != origt:
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        if tn == "hi-ndecl":
            saw_ndecl = True
            decl = getparam(t, "1")
            # Phonetic respellings follow "//" and run up to the next
            # "<", ">", ",", space or "-".
            phon_respellings = re.findall("//([^<>, -]*)", decl)
            if head_template_tr is None:
                pagemsg("WARNING: Saw {{hi-ndecl}} before any headwords: %s" %
                        unicode(t))
            else:
                respelling_tr = [
                    expand_text("{{xlit|hi|%s}}" % x) for x in phon_respellings
                ]
                if None in respelling_tr:
                    pagemsg(
                        "WARNING: Error during phonetic respelling translit, skipping"
                    )
                    continue
                # Periods are dropped before comparing with the headword
                # transliterations.
                respelling_tr = [x.replace(".", "") for x in respelling_tr]
                for phon_respelling in phon_respellings:
                    if u"॰" in phon_respelling:
                        pagemsg(u"WARNING: Saw ॰ in phon_respelling %s in %s" %
                                (phon_respelling, unicode(t)))
                if head_template_tr and not phon_respellings:
                    pagemsg(
                        "WARNING: Missing phonetic respelling in %s, headtr=%s, autotr=%s"
                        % (unicode(t), ",".join(head_template_tr),
                           ",".join(head_auto_tr)))
                elif phon_respellings and not head_template_tr:
                    pagemsg(
                        "WARNING: Extra phonetic respelling %s (translit %s) in %s, no head tr"
                        % (",".join(phon_respellings), ",".join(respelling_tr),
                           unicode(t)))
                elif set(respelling_tr) != set(head_template_tr):
                    pagemsg(
                        "WARNING: Phonetic respelling %s (translit %s) in %s differs from head translit %s, auto translit %s"
                        % (",".join(phon_respellings), ",".join(respelling_tr),
                           unicode(t), ",".join(head_template_tr),
                           ",".join(head_auto_tr)))
                elif phon_respellings:
                    pagemsg(
                        "Phonetic respelling %s (translit %s) in %s agrees with head translit %s, auto translit %s"
                        % (",".join(phon_respellings), ",".join(respelling_tr),
                           unicode(t), ",".join(head_template_tr),
                           ",".join(head_auto_tr)))

    # Same check as at the start of each headword, for the last headword on
    # the page.
    if noun_head_template and head_template_tr and not saw_ndecl:
        pagemsg(
            "WARNING: Missing declension for noun needing phonetic respelling, headtr=%s, autotr=%s: %s"
            % (",".join(head_template_tr), ",".join(head_auto_tr),
               unicode(noun_head_template)))

    return unicode(parsed), notes
Esempio n. 29
0
def process_text_on_page(index, pagetitle, text):
    """Convert {{autocat}} and the per-affix category templates to {{auto cat}}.

    {{autocat}} is simply renamed. A {{prefix cat}}, {{suffix cat}},
    {{circumfix cat}}, {{infix cat}} or {{interfix cat}} call is converted only
    when everything it specifies by hand (language, affix type, term, ID, part
    of speech) agrees with what can be read off the category page title;
    otherwise a warning is logged and the template is left untouched.

    index and pagetitle are used for log messages; pagetitle is also parsed to
    obtain the auto-determined values. text is the wikitext of the page.

    Returns a tuple (new page text, list of change notes).
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    global args

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    notes = []

    pagemsg("Processing")

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn == "autocat":
            blib.set_template_name(t, "auto cat")
            notes.append("{{autocat}} -> {{auto cat}}")
        elif tn in [
                "prefix cat", "suffix cat", "circumfix cat", "infix cat",
                "interfix cat"
        ]:
            # Recover language name, part of speech, affix type and term from
            # a title of the form
            # "Category:<language> <pos> <affix type>ed with <term>".
            m = re.search("^Category:(.*) ([a-z]+) ([a-z]+fix)ed with (.*)$",
                          pagetitle)
            if not m:
                pagemsg("WARNING: Can't parse page title")
                continue
            langname, pos, affixtype, term_and_id = m.groups()
            # A trailing parenthesized part of the term is taken as the ID.
            m = re.search(r"^(.*?) \((.*)\)$", term_and_id)
            if m:
                term, id = m.groups()
            else:
                term, id = term_and_id, ""
            # Manually specified values; all of them are read before the
            # params are wiped further down.
            t_lang = getparam(t, "1")
            t_term = getparam(t, "2")
            t_alt = getparam(t, "3")
            t_pos = getparam(t, "pos")
            t_id = getparam(t, "id")
            t_tr = getparam(t, "tr")
            t_sort = getparam(t, "sort")
            t_sc = getparam(t, "sc")
            if langname not in blib.languages_byCanonicalName:
                pagemsg("WARNING: Unrecognized language name: %s" % langname)
                continue
            if blib.languages_byCanonicalName[langname]["code"] != t_lang:
                pagemsg(
                    "WARNING: Auto-determined code %s for language name %s != manually specified %s"
                    % (blib.languages_byCanonicalName[langname]["code"],
                       langname, t_lang))
                continue
            # tn[:-4] drops the trailing " cat", leaving the affix type.
            if tn[:-4] != affixtype:
                pagemsg(
                    "WARNING: Auto-determined affix type %s != manually specified %s"
                    % (affixtype, tn[:-4]))
                continue

            def add_missing_hyphens(alt):
                """Return alt with the hyphen(s) that term carries added if absent.

                Uses term and affixtype from the enclosing scope. A leading
                "*" on either string is set aside before hyphens are compared.
                possible_hyphens is defined elsewhere in the file.
                """
                hyph_c = "([" + possible_hyphens + "])"
                # althyp/termhyp hold the leading "*" (if any), despite the
                # names; altbase/termbase hold the remainder.
                m = re.search(r"^(\*)(.*)$", alt)
                if m:
                    althyp, altbase = m.groups()
                else:
                    althyp, altbase = "", alt
                m = re.search(r"^(\*)(.*)$", term)
                if m:
                    termhyp, termbase = m.groups()
                else:
                    termhyp, termbase = "", term
                if affixtype == "suffix":
                    # Suffixes take the term's leading hyphen.
                    m = re.search("^" + hyph_c, termbase)
                    if m:
                        initial_hyphen = m.group(1)
                        if not altbase.startswith(initial_hyphen):
                            alt = althyp + initial_hyphen + altbase
                elif affixtype == "prefix":
                    # Prefixes take the term's trailing hyphen.
                    m = re.search(hyph_c + "$", termbase)
                    if m:
                        final_hyphen = m.group(1)
                        if not altbase.endswith(final_hyphen):
                            alt = althyp + altbase + final_hyphen
                elif affixtype in ["infix", "interfix"]:
                    # Infixes and interfixes take a hyphen on both sides.
                    m = re.search("^" + hyph_c + ".*" + hyph_c + "$", termbase)
                    if m:
                        initial_hyphen, final_hyphen = m.groups()
                        if not altbase.startswith(initial_hyphen):
                            altbase = initial_hyphen + altbase
                        if not altbase.endswith(final_hyphen):
                            altbase = altbase + final_hyphen
                        alt = althyp + altbase
                # Any other affix type (e.g. circumfix) is returned unchanged.
                return alt

            orig_t_term = t_term
            t_term = add_missing_hyphens(t_term)
            already_checked_t_alt = False
            if t_term != term:
                # The manual term may differ from the title only by what
                # makeEntryName removes; if so, it is kept as the display
                # form (alt).
                manual_entry_name = expand_text(
                    "{{#invoke:languages/templates|makeEntryName|%s|%s}}" %
                    (t_lang, t_term))
                if manual_entry_name != term:
                    pagemsg(
                        "WARNING: Can't match manually specified term %s (originally %s, entry name %s) to auto-determined term %s"
                        % (t_term, orig_t_term, manual_entry_name, term))
                    continue
                if t_alt:
                    pagemsg(
                        "WARNING: Manually specified term %s has extra diacritics and alt=%s also specified, skipping"
                        % (t_term, t_alt))
                    continue
                t_alt = t_term
                already_checked_t_alt = True
            if t_id != id:
                pagemsg(
                    "WARNING: Auto-determined ID %s != manually specified %s" %
                    (id, t_id))
                continue
            # The manual pos must equal the title's pos, or be its singular
            # (title pos = manual pos + "s", or + "es" after a final "x").
            # For "words", a blank manual pos is accepted too.
            if (pos == "words" and t_pos not in ["", "word", "words"]
                    or pos != "words" and t_pos != pos and t_pos + "s" != pos
                    and (not t_pos.endswith("x") or t_pos + "es" != pos)):
                pagemsg(
                    "WARNING: Auto-determined pos %s doesn't match manually specified %s"
                    % (pos, t_pos))
                continue
            if t_alt and not already_checked_t_alt:
                # An explicitly given alt must also reduce to the title's term.
                orig_t_alt = t_alt
                t_alt = add_missing_hyphens(t_alt)
                manual_entry_name = expand_text(
                    "{{#invoke:languages/templates|makeEntryName|%s|%s}}" %
                    (t_lang, t_alt))
                if manual_entry_name != term:
                    pagemsg(
                        "WARNING: Can't match manually specified alt %s (originally %s, entry name %s) to auto-determined term %s"
                        % (t_alt, orig_t_alt, manual_entry_name, term))
                    continue
            if t_sort:
                # Keep sort= only when it yields a different sort key from the
                # one computed for the term itself.
                auto_entry_name = expand_text(
                    "{{#invoke:languages/templates|makeEntryName|%s|%s}}" %
                    (t_lang, term))
                autosort = expand_text(
                    "{{#invoke:languages/templates|getByCode|%s|makeSortKey|%s}}"
                    % (t_lang, auto_entry_name))
                manual_entry_name = expand_text(
                    "{{#invoke:languages/templates|makeEntryName|%s|%s}}" %
                    (t_lang, add_missing_hyphens(t_sort)))
                manual_sort = expand_text(
                    "{{#invoke:languages/templates|getByCode|%s|makeSortKey|%s}}"
                    % (t_lang, manual_entry_name))
                if manual_sort != autosort:
                    pagemsg(
                        "Keeping sort key %s because canonicalized sort key %s based on it not same as canonicalized sort key %s based on term %s"
                        % (t_sort, manual_sort, autosort, term))
                else:
                    pagemsg(
                        "Discarding sort key %s because canonicalized sort key %s based on it same as canonicalized sort key based on term %s"
                        % (t_sort, manual_sort, term))
                    t_sort = ""

            # Refuse to convert if the template has any param this script does
            # not know how to carry over. ("tr" is listed twice below; the
            # duplicate is harmless.)
            must_continue = False
            all_existing_params = [
                "1", "2", "3", "tr", "pos", "id", "tr", "sc", "sort"
            ]
            for param in t.params:
                pn = pname(param)
                if pn not in all_existing_params:
                    pagemsg(
                        "WARNING: Unrecognized param %s=%s in affix cat: %s" %
                        (pn, unicode(param.value), unicode(t)))
                    must_continue = True
                    break
            if must_continue:
                continue
            # All checks passed: wipe the old params, rename the template, and
            # re-add only alt=, tr=, sort= and sc= where still needed.
            for param in all_existing_params:
                rmparam(t, param)
            blib.set_template_name(t, "auto cat")
            if t_alt:
                if t_alt == term:
                    pagemsg(
                        "Not adding alt=%s because it's the same as the term" %
                        t_alt)
                else:
                    t.add("alt", t_alt)
            if t_tr:
                t.add("tr", t_tr)
            if t_sort:
                t.add("sort", t_sort)
            if t_sc:
                t.add("sc", t_sc)
            notes.append("convert {{%s}} to {{auto cat}}" % tn)

        if unicode(t) != origt:
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

    return unicode(parsed), notes
Esempio n. 30
0
def process_text_on_page(index, pagetitle, text):
  """Normalize the translit argument of Sanskrit noun declension templates.

  Templates are visited in order, remembering the most recent {{sa-noun}}
  headword. For each following {{sa-decl-noun-*}} template:

  * {{sa-decl-noun-m}}, {{sa-decl-noun-f}} and {{sa-decl-noun-n}} get their
    first param filled in or repaired (blank, Devanagari, unaccented,
    hyphenated, or padded with spaces), using the headword's tr= (or a
    transliteration of the page title when tr= is absent);
  * templates listed in old_template_to_gender are converted to
    {{sa-decl-noun-<gender>}} with the translit as sole param.

  index and pagetitle are used for log messages (pagetitle also as the
  transliteration fallback); text is the wikitext of the page.

  Returns (new page text, list of change notes), or None when the page is
  skipped (no relevant template name in the text, or a colon in the title).
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  notes = []

  # Cheap textual pre-filter before parsing.
  if "sa-noun" not in text and "sa-decl-noun" not in text:
    return

  if ":" in pagetitle:
    pagemsg("Skipping non-mainspace title")
    return

  pagemsg("Processing")

  parsed = blib.parse_text(text)

  # Most recent {{sa-noun}} seen, and whether a declension followed it.
  headt = None
  saw_decl = False

  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)

    if tn == "sa-noun":
      pagemsg("Saw headt=%s" % unicode(t))
      if headt and not saw_decl:
        pagemsg("WARNING: Saw two {{sa-noun}} without {{sa-decl-noun}}: %s and %s" % (unicode(headt), unicode(t)))
      headt = t
      saw_decl = False
      continue

    if tn in ["sa-decl-noun", "sa-decl"]:
      pagemsg("WARNING: Saw raw {{%s}}: %s, headt=%s" % (tn, unicode(t), headt and unicode(headt) or None))
      continue

    if tn.startswith("sa-decl-noun-"):
      pagemsg("Saw declt=%s" % unicode(t))
      if not headt:
        pagemsg("WARNING: Saw {{%s}} without {{sa-noun}}: %s" % (tn, unicode(t)))
        continue
      saw_decl = True

      # Work out the translit to use and whether it carries an accent mark.
      tr = getparam(headt, "tr")
      accented_tr = False
      if not tr:
        tr = expand_text("{{xlit|sa|%s}}" % pagetitle)
        pagemsg("WARNING: No translit in %s, using %s from pagetitle: declt=%s" % (unicode(headt), tr, unicode(t)))
      else:
        if "-" in tr:
          pagemsg("WARNING: Saw translit %s in head with hyphen: headt=%s, declt=%s" % (tr, unicode(headt), unicode(t)))
          tr = tr.replace("-", "")
        # Recompose s + AC into the letter ś so that it is not mistaken for
        # an accent mark in the check below.
        decomptr = unicodedata.normalize("NFD", tr).replace("s" + AC, u"ś")
        if AC not in decomptr and GR not in decomptr:
          pagemsg("WARNING: Saw translit %s in head without accent: headt=%s, declt=%s" % (tr, unicode(headt), unicode(t)))
        else:
          accented_tr = True
      genders = blib.fetch_param_chain(headt, "g")
      genders = [g.replace("-p", "").replace("bysense", "") for g in genders]
      # Expand combined gender codes into their component genders.
      genders = [g for gs in genders for g in (
        ["m", "f"] if gs in ["mf", "fm"] else ["m", "n"] if gs in ["mn", "nm"] else [gs]
      )]

      if tn in ["sa-decl-noun-m", "sa-decl-noun-f", "sa-decl-noun-n"]:
        tg = tn[-1]
        if tg not in genders:
          pagemsg("WARNING: Saw decl gender %s that disagrees with headword gender(s) %s: headt=%s, declt=%s" % (
            tg, ",".join(genders), unicode(headt), unicode(t)))
          continue

        decltr = getparam(t, "1")
        if not decltr:
          if not accented_tr:
            pagemsg("WARNING: No param in {{%s}}, replacing with unaccented tr %s from head or pagename: headt=%s, declt=%s" % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("add (unaccented) translit %s to {{%s}}" % (tr, tn))
          else:
            pagemsg("WARNING: No param in {{%s}}, replacing with accented tr %s from head: headt=%s, declt=%s" % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("add accented translit %s to {{%s}}" % (tr, tn))
        elif re.search(u"[\u0900-\u097F]", decltr): # translit is actually Devanagari
          if not accented_tr:
            pagemsg("WARNING: Devanagari in {{%s}}, replacing with unaccented tr %s from head or pagename: headt=%s, declt=%s" % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            # Arguments follow the order of the placeholders: template name
            # first, then translit.
            notes.append("replace Devanagari in {{%s}} with (unaccented) translit %s" % (tn, tr))
          else:
            pagemsg("WARNING: Devanagari in {{%s}}, replacing with accented tr %s from head: headt=%s, declt=%s" % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("replace Devanagari in {{%s}} with accented translit %s" % (tn, tr))
        else:
          decompdecltr = unicodedata.normalize("NFD", decltr).replace("s" + AC, u"ś")
          # Set once param 1 has been replaced wholesale by the head's
          # translit; the in-place cleanups of the old value below must then
          # be skipped so they do not overwrite that replacement.
          replaced_from_head = False
          if AC not in decompdecltr and GR not in decompdecltr:
            if accented_tr:
              pagemsg("WARNING: Saw translit %s in decl without accent, subbing accented tr %s from head: headt=%s, declt=%s" %
                  (decltr, tr, unicode(headt), unicode(t)))
              t.add("1", tr)
              notes.append("replace existing translit %s with accented translit %s in {{%s}}" % (decltr, tr, tn))
              replaced_from_head = True
            else:
              pagemsg("WARNING: Saw translit %s in decl without accent and unable to replace with accented tr from head: headt=%s, declt=%s" %
                  (decltr, unicode(headt), unicode(t)))
          if not replaced_from_head and "-" in decltr:
            pagemsg("WARNING: Saw translit %s in decl with hyphen: headt=%s, declt=%s" %
                (decltr, unicode(headt), unicode(t)))
            notes.append("remove hyphen from existing translit %s in {{%s}}" % (decltr, tn))
            decltr = decltr.replace("-", "")
            t.add("1", decltr)
          stripped_decltr = decltr.strip()
          if not replaced_from_head and "\n" not in decltr and stripped_decltr != decltr:
            pagemsg("WARNING: Saw translit '%s' in decl with extraneous space: headt=%s, declt=%s" %
                (decltr, unicode(headt), unicode(t)))
            notes.append("remove extraneous space from existing translit '%s' in {{%s}}" % (decltr, tn))
            decltr = stripped_decltr
            t.add("1", decltr)
        # Log here: the `continue` below bypasses the logging at the bottom
        # of the loop.
        if origt != unicode(t):
          pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        continue

      if tn in [u"sa-decl-noun-ī", u"sa-decl-noun-ī-f"] and getparam(t, "mono"):
        pagemsg("WARNING: Saw mono=, skipping: headt=%s, declt=%s" % (unicode(headt), unicode(t)))
        continue

      if tn in old_template_to_gender:
        # Only convert templates whose params are all known and discardable.
        must_continue = False
        for param in t.params:
          pn = pname(param)
          if pn not in ["1", "2", "3", "4", "n"]:
            pagemsg("WARNING: Saw unknown param %s=%s in %s: headt=%s" % (pn, unicode(param.value), unicode(t),
              unicode(headt)))
            must_continue = True
            break
        if must_continue:
          continue

        g = old_template_to_gender[tn]
        if g not in genders:
          pagemsg("WARNING: Saw decl gender %s that disagrees with headword gender(s) %s: headt=%s, declt=%s" % (
            g, ",".join(genders), unicode(headt), unicode(t)))
          continue

        blib.set_template_name(t, "sa-decl-noun-%s" % g)
        rmparam(t, "n")
        rmparam(t, "4")
        rmparam(t, "3")
        rmparam(t, "2")
        t.add("1", tr)
        notes.append("convert {{%s}} to {{sa-decl-noun-%s}}" % (tn, g))
      else:
        pagemsg("WARNING: Saw unrecognized decl template: %s" % unicode(t))

    if origt != unicode(t):
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))

  # Warn only when the last headword really had no declension after it.
  if headt and not saw_decl:
    pagemsg("WARNING: Saw {{sa-noun}} without {{sa-decl-noun-*}}: %s" % unicode(headt))

  return unicode(parsed), notes
Esempio n. 31
0
def infer_one_page_decls_1(page, index, text):
    """Rewrite the adjective declension templates in text to the inferred form.

    For every template whose name is in decl_templates, infer_decl() is asked
    for a new argument list. When it returns one, the numbered params 1-15 and
    the short_* params are dropped and replaced by the inferred arguments.
    When it returns nothing, only stem and declension are recombined via
    combine_stem() and trailing blank numbered params are removed; that
    lighter rewrite is checked with compare_results().

    page and index are used for log messages; text is the parsed page, which
    is modified in place.

    Returns (text, edit summary), or (None, None) if compare_results()
    rejects a rewrite.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, unicode(page.title()), txt))

    for tempname in decl_templates:
        for t in text.filter_templates():
            if unicode(t.name).strip() == tempname:
                orig_template = unicode(t)
                inferred_args = infer_decl(t, pagemsg)
                if not inferred_args:
                    # At least combine stem and declension, blanking decl when possible.
                    stem, decl = combine_stem(getparam(t, "1"),
                                              getparam(t, "2"))
                    t.add("1", stem)
                    t.add("2", decl)
                    # Remove any trailing blank arguments.
                    for i in xrange(15, 0, -1):
                        if not getparam(t, i):
                            rmparam(t, i)
                        else:
                            break
                    new_template = unicode(t)
                    if orig_template != new_template:
                        if not compare_results(orig_template, new_template,
                                               pagemsg):
                            return None, None
                else:
                    for i in xrange(15, 0, -1):
                        rmparam(t, i)
                    rmparam(t, "short_m")
                    rmparam(t, "short_f")
                    rmparam(t, "short_n")
                    rmparam(t, "short_p")
                    t.name = tempname
                    i = 1
                    for arg in inferred_args:
                        if "=" in arg:
                            # Split on the first "=" only, so a value that
                            # itself contains "=" stays intact instead of
                            # breaking the two-way unpacking.
                            name, value = arg.split("=", 1)
                            t.add(name, value)
                        else:
                            t.add(i, arg)
                            i += 1
                    new_template = unicode(t)
                if orig_template != new_template:
                    if verbose:
                        pagemsg("Replacing %s with %s" %
                                (orig_template, new_template))

    return text, "Convert adj decl to new form and infer short-accent pattern"
Esempio n. 32
0
def _la_lemma_from_generated_forms(t, generate_name, chains_to_remove,
                                   params_to_remove, slots, expand_text,
                                   pagemsg):
    """Expand a {{la-generate-*-forms}} copy of t and pull the lemma from it.

    A copy of t is renamed to generate_name, stripped of the given param
    chains and single params, and expanded. The value of the first slot in
    slots that is present in the result is split on commas and returned as a
    list. Returns "" (after logging a warning) if the expansion fails or none
    of the slots is present, so that the caller's fallback applies.
    """
    generate_template = blib.parse_text(unicode(t)).filter_templates()[0]
    blib.set_template_name(generate_template, generate_name)
    for chain in chains_to_remove:
        blib.remove_param_chain(generate_template, chain, chain)
    for param in params_to_remove:
        rmparam(generate_template, param)
    result = expand_text(unicode(generate_template))
    if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        return ""
    args = blib.split_generate_args(result)
    for slot in slots:
        if slot in args:
            return args[slot].split(",")
    pagemsg(
        "WARNING: Can't locate lemma in {{%s}} result: generate_template=%s, result=%s"
        % (generate_name, unicode(generate_template), result))
    # Deliberately not "".split(","): that gives [""], which is truthy and
    # would defeat the caller's fallback to the page name.
    return ""


def la_get_headword_from_template(t, pagename, pagemsg, expand_text=None):
    """Return the list of headwords (lemmas) expressed by Latin template t.

    t is a headword template; pagename is the fallback headword and the page
    against which templates are expanded; pagemsg is the logging callback;
    expand_text optionally overrides how template calls are expanded (by
    default blib.expand_text() is used, non-verbosely).

    Always returns a list. If no headword can be determined (unrecognized
    template, failed expansion, or lemma missing from the generated forms),
    the result is [pagename].
    """
    if not expand_text:

        def expand_text(tempcall):
            return blib.expand_text(tempcall, pagename, pagemsg, False)

    tn = tname(t)
    if tn in [
            "la-adj", "la-part", "la-num-adj", "la-suffix-adj", "la-det",
            "la-pronoun"
    ]:
        retval = blib.fetch_param_chain(t, "lemma", "lemma")
        if not retval:
            retval = getparam(t, "1")
            if "<" in retval or "((" in retval or " " in retval or "-" in retval:
                # Param 1 is a declension spec rather than a plain lemma, so
                # the module has to generate the forms.
                # FIXME: This is wrong, if indecl=1 then we shouldn't try to decline it.
                retval = _la_lemma_from_generated_forms(
                    t, "la-generate-adj-forms",
                    ["comp", "sup", "adv", "lemma"],
                    ["type", "indecl", "id", "pos"],
                    ["linked_nom_sg_m", "linked_nom_pl_m"],
                    expand_text, pagemsg)
            else:
                # Drop everything from the first slash onwards.
                retval = re.sub("/.*", "", retval)
    elif tn in ["la-noun", "la-num-noun", "la-suffix-noun", "la-proper noun"]:
        retval = blib.fetch_param_chain(t, "lemma", "lemma")
        if not retval:
            # FIXME: This is wrong, if indecl=1 then we shouldn't try to decline it.
            retval = _la_lemma_from_generated_forms(
                t, "la-generate-noun-forms",
                ["lemma", "m", "f", "g"],
                ["type", "indecl", "id", "pos"],
                ["linked_nom_sg", "linked_nom_pl"],
                expand_text, pagemsg)
    elif tn in ["la-verb", "la-suffix-verb"]:
        retval = blib.fetch_param_chain(t, "lemma", "lemma")
        if not retval:
            retval = _la_lemma_from_generated_forms(
                t, "la-generate-verb-forms",
                [],
                ["id"],
                ["linked_1s_pres_actv_indc", "linked_3s_pres_actv_indc",
                 "linked_1s_perf_actv_indc", "linked_3s_perf_actv_indc"],
                expand_text, pagemsg)
    elif tn in la_adj_headword_templates or tn in la_adv_headword_templates or (
            tn in ["la-suffix", "la-suffix-adv", "la-gerund"]):
        retval = getparam(t, "1")
    elif tn == "la-letter":
        retval = pagename
    elif tn in ["head", "la-prep"]:
        retval = blib.fetch_param_chain(t, "head", "head")
    elif tn in la_nonlemma_headword_templates or tn in la_misc_headword_templates:
        retval = blib.fetch_param_chain(t, "1", "head")
    else:
        pagemsg("WARNING: Unrecognized headword template %s" % unicode(t))
        retval = ""
    retval = retval or pagename
    if not isinstance(retval, list):
        retval = [retval]
    return retval
def process_page_section(index, page, section, verbose):
  """Bring the Russian noun headword in a section in line with its noun table.

  The section must contain exactly one {{ru-noun-table}} and exactly one
  {{ru-noun+}} or {{ru-proper noun+}}. The headword's params (other than
  g/m/f chains and notrcat) are compared with those of the noun table; if
  they differ, either the headword's linked params are copied into the table
  (when the two differ only by links) or the headword is overwritten from
  the table. The headword is rewritten only when the module-level `lemmas`
  is empty or contains the page title.

  index and page are used for log messages and template expansion; section
  is the wikitext to process; verbose enables extra logging.

  Returns None if the section must be skipped, otherwise a 5-tuple
  (new section text, ru_noun_table_cleaned, ru_noun_table_link_copied,
  ru_noun_changed, ru_proper_noun_changed), the last four being 0 or 1.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None

  parsed = blib.parse_text(section)
  # Nothing is modified until all the checks below have passed, so a single
  # scan of the section's templates serves every lookup.
  templates = parsed.filter_templates()

  def templates_named(*names):
    return [t for t in templates if unicode(t.name) in names]

  if templates_named("ru-decl-noun-see"):
    pagemsg("Found ru-decl-noun-see, skipping")
    return None

  noun_table_templates = templates_named("ru-noun-table")
  noun_old_templates = templates_named("ru-noun-old")

  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if not noun_table_templates:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
          ", ".join(unicode(x) for x in noun_old_templates))
    return unicode(parsed), 0, 0, 0, 0

  if templates_named("ru-noun", "ru-proper noun"):
    pagemsg("Found ru-noun or ru-proper noun, skipping")
    return None

  headword_templates = templates_named("ru-noun+", "ru-proper noun+")

  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if not headword_templates:
    return unicode(parsed), 0, 0, 0, 0

  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if noun_old_templates else None
  headword_template = headword_templates[0]

  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  orig_headword_template = unicode(headword_template)
  orig_noun_table_template = unicode(noun_table_template)

  # Headword-only information, saved so it can be restored if the headword
  # gets overwritten from the noun table.
  genders = blib.fetch_param_chain(headword_template, "g", "g")
  masculines = blib.fetch_param_chain(headword_template, "m", "m")
  feminines = blib.fetch_param_chain(headword_template, "f", "f")
  notrcat = getparam(headword_template, "notrcat")

  # Copy of the headword without the headword-only params, spelled as
  # {{ru-noun+}} so it can be compared textually with the noun table.
  filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
  for param in headword_template.params:
    name = unicode(param.name)
    if not (re.search("^[gmf][0-9]*$", name) or name == "notrcat"):
      filtered_headword_template.add(param.name, param.value)

  ru_noun_table_cleaned = 0
  ru_noun_table_link_copied = 0
  ru_noun_changed = 0
  ru_proper_noun_changed = 0

  # Drop any g=/m=/f= params from the noun table.
  new_decl_params = []
  for param in noun_table_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name):
      pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" %
          unicode(noun_table_template))
    else:
      new_decl_params.append((param.name, param.value))
  del noun_table_template.params[:]
  for name, value in new_decl_params:
    noun_table_template.add(name, value)
  if orig_noun_table_template != unicode(noun_table_template):
    ru_noun_table_cleaned = 1

  # Scratch copy of the noun table from which the new headword is derived.
  modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  for param in noun_table_template.params:
    modified_noun_table_template.add(param.name, param.value)

  if unicode(headword_template.name) == "ru-proper noun+":
    generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
        unicode(noun_table_template))
    generate_result = expand_text(generate_template)
    if not generate_result:
      pagemsg("WARNING: Error generating noun args, skipping")
      return None
    args = ru.split_generate_args(generate_result)

    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
      pagemsg("Adding n=both to headword template")
      modified_noun_table_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      # Append ndef=sg to the outer call only; a blanket replace of "}}"
      # would also inject it into any template nested inside a param.
      generate_template_with_ndef = re.sub(r"\}\}$", "|ndef=sg}}",
          generate_template)
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
          generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = ru.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
              existing_n)
        pagemsg("Removing n=sg from headword template")
        rmparam(modified_noun_table_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
            ndef_args["n"])

  new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
      unicode(modified_noun_table_template))
  existing_filtered_headword_template = unicode(filtered_headword_template)
  change_existing_headword = False
  if existing_filtered_headword_template != new_headword_template:
    if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
      if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
        pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
        del noun_table_template.params[:]
        for param in filtered_headword_template.params:
          noun_table_template.add(param.name, param.value)
        ru_noun_table_link_copied = 1
        # Counted as a link copy rather than a cleanup.
        ru_noun_table_cleaned = 0
      else:
        pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually"
            % (existing_filtered_headword_template, new_headword_template))
        return None
    else:
      pagemsg("WARNING: Existing headword template %s will be overwritten with %s"
          % (existing_filtered_headword_template, new_headword_template))
      change_existing_headword = True

  # `lemmas` is defined at module level; when it is empty every page is
  # eligible, otherwise only the listed page titles are.
  if change_existing_headword and (not lemmas or pagetitle in lemmas):
    del headword_template.params[:]
    for param in modified_noun_table_template.params:
      headword_template.add(param.name, param.value)
    blib.set_param_chain(headword_template, genders, "g", "g")
    blib.set_param_chain(headword_template, masculines, "m", "m")
    blib.set_param_chain(headword_template, feminines, "f", "f")
    if notrcat:
      headword_template.add("notrcat", notrcat)

  new_noun_table_template = unicode(noun_table_template)
  if new_noun_table_template != orig_noun_table_template:
    pagemsg("Replacing noun table %s with %s" % (orig_noun_table_template,
      new_noun_table_template))

  new_headword_template = unicode(headword_template)
  if new_headword_template != orig_headword_template:
    pagemsg("Replacing headword %s with %s" % (orig_headword_template,
      new_headword_template))
    if unicode(headword_template.name) == "ru-noun+":
      ru_noun_changed = 1
    else:
      ru_proper_noun_changed = 1

  return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed
Esempio n. 34
0
def process_page(page, index, parsed):
    """Convert French {{RQ:Don Quixote}} calls to the Viardot template.

    Each {{RQ:Don Quixote}} template in `parsed` whose lang= is "fr" is
    renamed to {{RQ:Cervantes Viardot Don Quichotte}} and its parameters are
    rebuilt in this order: numbered params, volume=, chapter=, text=, t=,
    then every other named param.

    Returns a tuple of the (mutated) `parsed` object and the list of notes.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def take_first(template, *names):
        # Return the first non-blank stripped value among `names` and remove
        # every one of those params from the template.
        value = ""
        for nm in names:
            if not value:
                value = getparam(template, nm).strip()
            rmparam(template, nm)
        return value

    pagemsg("Processing")

    notes = []
    for t in parsed.filter_templates():
        if tname(t) != "RQ:Don Quixote":
            continue
        if getparam(t, "lang").strip() != "fr":
            continue

        origt = unicode(t)
        blib.set_template_name(t, "RQ:Cervantes Viardot Don Quichotte")
        rmparam(t, "lang")

        # A missing volume means volume I; an Arabic "2" becomes Roman "II".
        volume = take_first(t, "volume")
        if not volume:
            volume = "I"
        elif volume == "2":
            volume = "II"
        chapter = take_first(t, "chapter")
        passage = take_first(t, "text", "passage")
        translation = take_first(t, "t", "translation")

        # Snapshot what is left, split into numbered and named params.
        saved = [(unicode(p.name), p.value, p.showkey) for p in t.params]
        positional = [entry for entry in saved
                      if re.search("^[0-9]+$", entry[0])]
        keyword = [entry for entry in saved
                   if not re.search("^[0-9]+$", entry[0])]

        # Wipe the template and rebuild it in the canonical order.
        del t.params[:]
        for pn, pv, keyshown in positional:
            t.add(pn, pv, showkey=keyshown, preserve_spacing=False)
        t.add("volume", volume)
        for key, val in (("chapter", chapter), ("text", passage),
                         ("t", translation)):
            if val:
                t.add(key, val)
        for pn, pv, keyshown in keyword:
            t.add(pn, pv, showkey=keyshown, preserve_spacing=False)

        notes.append(
            "Replace {{RQ:Don Quixote}} with {{RQ:Cervantes Viardot Don Quichotte}}"
        )
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    return parsed, notes
Esempio n. 35
0
def process_page(page, index):
    """Generate the inflected forms of a Latin noun and process each one.

    Finds the Latin section of `page`, locates its single {{la-noun}} or
    {{la-proper noun}} headword, expands a {{la-generate-noun-forms}} call
    built from that headword, and passes every distinct inflected form
    (other than the page title itself) to process_form() via blib.do_edit().

    Returns None in all cases; problems are reported through pagemsg() and
    cause an early return.
    """
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    text = unicode(page.text)

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return

    # Only the body of the Latin section is needed here.
    sections, j, secbody, sectail, has_non_latin = retval

    parsed = blib.parse_text(secbody)
    saw_noun = None
    saw_proper_noun = None
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "la-noun":
            if saw_noun:
                pagemsg(
                    "WARNING: Saw multiple nouns %s and %s, not sure how to proceed, skipping"
                    % (unicode(saw_noun), unicode(t)))
                return
            saw_noun = t
        elif tn == "la-proper noun":
            if saw_proper_noun:
                pagemsg(
                    "WARNING: Saw multiple proper nouns %s and %s, not sure how to proceed, skipping"
                    % (unicode(saw_proper_noun), unicode(t)))
                return
            saw_proper_noun = t
    if saw_noun and saw_proper_noun:
        pagemsg(
            "WARNING: Saw both noun and proper noun, can't correct header/headword"
        )
        return
    if not saw_noun and not saw_proper_noun:
        pagemsg(
            "WARNING: Saw neither noun nor proper noun, can't correct header/headword"
        )
        return
    pos = "pn" if saw_proper_noun else "n"
    ht = saw_proper_noun or saw_noun
    if getparam(ht, "indecl"):
        pagemsg("Noun is indeclinable, skipping: %s" % unicode(ht))
        return

    # Work on a re-parsed copy so the headword in `parsed` is left untouched.
    generate_template = blib.parse_text(unicode(ht)).filter_templates()[0]
    blib.set_template_name(generate_template, "la-generate-noun-forms")
    # Strip the params removed before expansion (headword-only, presumably).
    for chain in ["lemma", "m", "f", "g"]:
        blib.remove_param_chain(generate_template, chain, chain)
    for single in ["type", "indecl", "id", "pos"]:
        rmparam(generate_template, single)
    result = expand_text(unicode(generate_template))
    if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        return
    tempargs = blib.split_generate_args(result)

    # Collect each (slot, form) once per macron-less spelling, skipping forms
    # that contain "[" or "|" and the form that equals the page title.
    forms_seen = set()
    slots_and_forms_to_process = []
    for slot, formarg in tempargs.iteritems():
        for form in formarg.split(","):
            if "[" in form or "|" in form:
                continue
            form_no_macrons = lalib.remove_macrons(form)
            if form_no_macrons == pagetitle:
                continue
            if form_no_macrons in forms_seen:
                continue
            forms_seen.add(form_no_macrons)
            slots_and_forms_to_process.append((slot, form))

    # Use a distinct loop variable: rebinding `index` here would silently
    # change the index that pagemsg() reports for this page.
    for form_index, (slot, form) in blib.iter_items(
            sorted(slots_and_forms_to_process,
                   key=lambda x: lalib.remove_macrons(x[1]))):

        # Bind slot/form as defaults so the handler keeps this iteration's
        # values even if it were invoked after the loop has moved on.
        def handler(page, index, parsed, slot=slot, form=form):
            return process_form(page, index, slot, form, pos)

        blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(form)),
                     form_index,
                     handler,
                     save=args.save,
                     verbose=args.verbose,
                     diff=args.diff)
Esempio n. 36
0
def process_page(index, page, save, verbose):
  """Convert {{ru-conj-TYPE|...}} to {{ru-conj|TYPE|...}} (currently disabled).

  The function logs a warning and returns immediately, so everything below
  the early `return` is unreachable. The remaining code is the old
  conversion logic: rename the template to "ru-conj", fold the variant
  argument (if any) into the conjugation type, shift all numbered params up
  by one and store the conjugation type in 1=.

  index, page: page number and page object being processed.
  save: if true, save the modified page; otherwise only log what would be
    saved.
  verbose: if true, log the full old and new page text.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  # The script is intentionally short-circuited here.
  pagemsg("WARNING: Script no longer applies and would need fixing up")
  return

  # ---- Unreachable from here on ----
  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  for t in parsed.filter_templates():
    origt = unicode(t)
    tname = unicode(t.name)
    if tname.startswith("ru-conj-") and tname != "ru-conj-verb-see":
      m = re.search("^ru-conj-(.*)$", tname)
      t.name = "ru-conj"
      conjtype = m.group(1)
      # Position of the variant argument, which depends on the conjugation
      # type; None when the type is in none of the lists below.
      varargno = None
      variant = None
      if conjtype in ["3oa", "4a", "4b", "4c", "6a", "6c", "11a", "16a", "16b", u"irreg-дать", u"irreg-клясть", u"irreg-быть"]:
        varargno = 3
      elif conjtype in ["5a", "5b", "5c", "6b", "9a", "9b", "11b", "14a", "14b", "14c"]:
        varargno = 4
      elif conjtype in ["7b"]:
        varargno = 5
      elif conjtype in ["7a"]:
        varargno = 6
      if varargno:
        variant = getparam(t, str(varargno))
        # A variant beginning with a, b or c gets a "/" prefix before being
        # appended to the conjugation type.
        if re.search("^[abc]", variant):
          variant = "/" + variant
        # If any of the next three params has a value, blank the variant
        # slot rather than removing it (presumably to keep the later
        # positional params in place -- verify).
        if getparam(t, str(varargno + 1)) or getparam(t, str(varargno + 2)) or getparam(t, str(varargno + 3)):
          t.add(str(varargno), "")
        else:
          rmparam(t, str(varargno))
        conjtype = conjtype + variant
      notes.append("ru-conj-* -> ru-conj, moving params up by one%s" %
          (variant and " (and move variant spec)" or ""))
      # Walk from param 20 down to 1; once the highest non-empty param has
      # been seen, copy each param N into N+1.
      seenval = False
      for i in xrange(20, 0, -1):
        val = getparam(t, str(i))
        if val:
          seenval = True
        if seenval:
          t.add(str(i + 1), val)
      # The conjugation type (plus variant) becomes the new first param.
      t.add("1", conjtype)
      blib.sort_params(t)
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose):
  """Drop redundant head= / 1= params from French headword templates.

  A head= value (for templates in fr_head_templates) or a 1= value (for
  templates in fr_head_or_1_templates) is redundant when link_text() of the
  value equals link_text() of the page title. Non-redundant values are kept
  and checked against the page title after canonicalizing apostrophes.

  Pages whose title contains a colon are skipped. If the text changed, the
  page is saved when `save` is true; otherwise the would-be comment is only
  logged.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  text = unicode(page.text)

  def check_bad_head(value, arg):
    # Warn when the value, stripped of links and with apostrophe variants
    # normalized, still differs from the likewise-normalized page title.
    unlinked = blib.remove_links(value)
    canontext = re.sub(u"[׳’]", "'", unlinked)
    canonpagetitle = re.sub(u"[׳’]", "'", pagetitle)
    if canontext == canonpagetitle:
      return
    pagemsg("WARNING: Canonicalized %s=%s not same as canonicalized page title %s (orig %s=%s)" %
        (arg, canontext, canonpagetitle, arg, value))

  def same_as_title(value):
    # True when the linked form of the value matches that of the title.
    return link_text(pagetitle) == link_text(value)

  notes = []
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    before = unicode(t)
    name = unicode(t.name)

    if name in fr_head_templates:
      headval = getparam(t, "head")
      if headval:
        if same_as_title(headval):
          pagemsg("Removing redundant head=%s" % headval)
          rmparam(t, "head")
          notes.append("remove redundant head= from {{%s}}" % name)
        else:
          pagemsg("Not removing non-redundant head=%s" % headval)
          check_bad_head(headval, "head")

    if name in fr_head_or_1_templates:
      firstval = getparam(t, "1")
      if firstval:
        if same_as_title(firstval):
          pagemsg("Removing redundant 1=%s" % firstval)
          rmparam(t, "1")
          notes.append("remove redundant 1= from {{%s}}" % name)
        else:
          pagemsg("Not removing non-redundant 1=%s" % firstval)
          check_bad_head(firstval, "1")

    after = unicode(t)
    if before != after:
      pagemsg("Replacing %s with %s" % (before, after))

  newtext = unicode(parsed)
  if newtext == text:
    return

  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = newtext
  page.save(comment=comment)
Esempio n. 38
0
def rewrite_one_page_ru_decl_adj(page, index, text):
  """Rewrite old Russian adjective declension templates as {{ru-decl-adj}}.

  Walks every template in `text` (a parsed wikicode object). A
  {{ru-adj-table}} is simply renamed. Other old names are first normalized
  to an "ru-adjN" name; if that name is a key of ending_for_ru_adj, the
  params are rearranged (stem in 1=, ending in 2=, short forms in 3= to 6=
  with feminine and neuter swapped) and the template is renamed.

  Returns (text, comment), where comment lists the converted template names
  or is None when nothing was converted.
  """
  pagename = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))

  def stripped_name(template):
    return unicode(template.name).strip()

  def drop_params_8_to_3(template):
    for num in ["8", "7", "6", "5", "4", "3"]:
      rmparam(template, num)

  # (source param, whether the ending regex must match, ending regex,
  # warning text). The masculine entry warns when the regex DOES match.
  short_form_checks = [
    ("3", False, u"[аяоеыи]$",
     "WARNING: short masculine %s doesn't have right ending"),
    ("4", True, u"[ая]$",
     "WARNING: short feminine %s doesn't have right ending"),
    ("5", True, u"[ое]$",
     "WARNING: short neuter %s doesn't have right ending"),
    ("6", True, u"[ыи]$",
     "WARNING: short plural %s doesn't have right ending"),
  ]
  # (destination param, source param): old 4 and 5 trade places.
  short_form_targets = [("3", "3"), ("4", "5"), ("5", "4"), ("6", "6")]

  oldtemps = []
  for t in text.filter_templates():
    origname = stripped_name(t)
    origtemplate = unicode(t)

    if origname == "ru-adj-table":
      t.name = "ru-decl-adj"
      oldtemps.append(origname)
      continue

    # Normalize the various legacy names to "ru-adjN".
    if re.match("^ru-adjective[0-9]", origname):
      t.name = origname.replace("ru-adjective", "ru-adj")
    if stripped_name(t) == "ru-passive participle decl":
      t.name = "ru-adj1"
    suffix = None
    if stripped_name(t) == "ru-adj3-sja":
      suffix = u"ся"
      t.name = "ru-adj3"
    elif stripped_name(t) == "ru-adj5-suffix":
      suffix = "-" + getparam(t, "8")
      t.name = "ru-adj5"

    declname = stripped_name(t)
    if declname not in ending_for_ru_adj:
      # Not a recognized declension template; nothing is recorded.
      continue

    # ru-adj13 keeps its existing 1=; every other type takes the stem
    # from the old 2=.
    if declname != "ru-adj13":
      addparam(t, "1", getparam(t, "2").strip())
    addparam(t, "2", ending_for_ru_adj[declname])

    if declname in ["ru-adj13", "ru-adj7", "ru-adj8", "ru-adj9", "ru-adj12"]:
      drop_params_8_to_3(t)
    else:
      shorts = {}
      for num, must_match, ending_re, warning in short_form_checks:
        shortform = clean(getparam(t, num))
        shorts[num] = shortform
        if shortform:
          found = bool(re.search(ending_re, remove_diacritics(shortform)))
          if found != must_match:
            pagemsg(warning %
                shortform)
      drop_params_8_to_3(t)
      for dest, src in short_form_targets:
        if shorts[src]:
          addparam(t, dest, shorts[src])

    if suffix:
      addparam(t, "suffix", suffix)
    t.name = "ru-decl-adj"
    pagemsg("Rewrote %s as %s" % (origtemplate, unicode(t)))
    oldtemps.append(origname)

  comment = None
  if oldtemps:
    comment = "convert %s -> ru-decl-adj" % ", ".join(oldtemps)
  return text, comment
Esempio n. 39
0
def process_page(index, page, romaji_to_keep):
    """Clean up {{ja-noun}}, {{ja-adj}}, {{ja-verb}} and {{ja-pos}} calls.

    For each such template: drop an old script code in 1= (shifting the
    other numbered params down by one), turn hira= and kata= into trailing
    numbered params, remove rom= unless the page title is in
    `romaji_to_keep`, and remove hidx=.

    Returns a tuple of the new page text and the list of change notes.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def is_numbered(param_name):
        return re.search(r"^[0-9]+$", param_name)

    pagemsg("Processing")

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        template_name = unicode(t.name)
        if template_name not in ["ja-noun", "ja-adj", "ja-verb", "ja-pos"]:
            continue
        origt = unicode(t)

        # Step 1: drop an old script code from 1= and renumber what follows.
        script_code = getparam(t, "1")
        if script_code in ["r", "h", "ka", "k", "s", "ky", "kk"]:
            pagemsg("Removing 1=%s: %s" % (script_code, unicode(t)))
            notes.append("remove 1=%s from %s" % (script_code, template_name))
            rmparam(t, "1")
            for param in t.params:
                old_name = unicode(param.name)
                if is_numbered(old_name):
                    param.name = str(int(old_name) - 1)
                    param.showkey = False

        # Step 2: move hira=/kata= to numbered params, rebuilding the
        # template so that numbered params precede the named ones.
        if t.has("hira") or t.has("kata"):
            snapshot = [(unicode(p.name), p.value) for p in t.params]
            # Blank numbered params are dropped, which closes any gaps.
            positional = [unicode(v) for n, v in snapshot
                          if is_numbered(n) and unicode(v)]
            keyword = [(n, v) for n, v in snapshot
                       if not is_numbered(n) and n not in ["hira", "kata"]]

            hira = getparam(t, "hira")
            if hira:
                positional.append(hira)
                slot = len(positional)
                pagemsg("Moving hira=%s to %s=: %s" %
                        (hira, slot, unicode(t)))
                notes.append("move hira= to %s= in %s" %
                             (slot, template_name))
            kata = getparam(t, "kata")
            if kata:
                positional.append(kata)
                slot = len(positional)
                pagemsg("Moving kata=%s to %s=: %s" %
                        (kata, slot, unicode(t)))
                notes.append("move kata= to %s= in %s" %
                             (slot, template_name))

            del t.params[:]
            for number, value in enumerate(positional, 1):
                t.add(str(number), value)
            for key, value in keyword:
                t.add(key, value)

        # Step 3: remove rom= unless this page is on the keep list.
        if t.has("rom"):
            if pagetitle not in romaji_to_keep:
                pagemsg("Removing rom=%s: %s" %
                        (getparam(t, "rom"), unicode(t)))
                rmparam(t, "rom")
                notes.append("remove rom= from %s" % template_name)
            else:
                pagemsg("Keeping rom=%s because in romaji_to_keep: %s" %
                        (getparam(t, "rom"), unicode(t)))

        # Step 4: remove hidx=.
        if t.has("hidx"):
            pagemsg("Removing hidx=%s: %s" %
                    (getparam(t, "hidx"), unicode(t)))
            rmparam(t, "hidx")
            notes.append("remove hidx= from %s" % template_name)

        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes
Esempio n. 40
0
def process_page(index, page, save, verbose, fix_missing_plurals):
  """Replace generic {{head|fr|...}} calls with specific French templates.

  Handles these values of 2= (the head type):
    noun                          -> {{fr-noun}}
    proper noun(s)                -> {{fr-proper noun}}
    adjective(s), invariable only -> {{fr-adj|inv=y}}
    adjective form, verb form(s), interjection, preposition, prefix(es),
    suffix(es)                    -> {{fr-adj-form}}, {{fr-verb-form}},
                                     {{fr-intj}}, {{fr-prep}}, {{fr-prefix}},
                                     {{fr-suffix}}
  Templates with params outside the expected set are left alone with a
  warning.

  index, page: page number and page object being processed.
  save: if true, save the changed page; otherwise only log the comment.
  verbose: not used by this function.
  fix_missing_plurals: if true, a noun with no plural given is still
    converted (flagged for review); if false it is skipped.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  text = unicode(page.text)

  notes = []
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    name = unicode(t.name)
    if name == "head" and getparam(t, "1") == "fr":
      headtype = getparam(t, "2")
      # Set when a noun is converted despite having no explicit plural.
      fixed_plural_warning = False
      if headtype == "noun":
        head = getparam(t, "head")
        g = getparam(t, "g")
        g2 = getparam(t, "g2")
        # A plural is recognized only in the form 3=plural, 4=<plural form>.
        plural = ""
        if getparam(t, "3") == "plural":
          plural = getparam(t, "4")
        # Bail out if the template has any param we don't know how to carry
        # over; 3= and 4= are acceptable only when they specify the plural.
        unrecognized_params = False
        for param in t.params:
          pname = unicode(param.name)
          if pname in ["1", "2", "head", "g", "g2", "sort"] or plural and pname in ["3", "4"]:
            pass
          else:
            unrecognized_params = True
            break
        if unrecognized_params:
          pagemsg("WARNING: Unrecognized parameters in %s, skipping"
              % unicode(t))
          continue
        if not g:
          pagemsg("WARNING: No gender given in %s, skipping" % unicode(t))
          continue
        # A feminine noun with no plural is accepted as-is when the page
        # also has a French {{feminine noun of}} template.
        found_feminine_noun = False
        if g == "f" and not g2 and not plural:
          for tt in parsed.filter_templates():
            if (unicode(tt.name) == "feminine noun of" and
                getparam(tt, "lang") == "fr"):
              found_feminine_noun = True
        if found_feminine_noun:
          pagemsg("Found 'feminine noun of', assuming countable")
        elif g not in ["m-p", "f-p"] and not plural:
          # Non-plural-only noun without a plural: convert only on request.
          if fix_missing_plurals:
            pagemsg("WARNING: No plural given in %s, assuming default plural, PLEASE REVIEW"
                % unicode(t))
            fixed_plural_warning = True
          else:
            pagemsg("WARNING: No plural given in %s, skipping" % unicode(t))
            continue
        # Strip all old params, then rebuild as
        # {{fr-noun|head=...|<gender>|g2=...|<plural>}}.
        rmparam(t, "4")
        rmparam(t, "3")
        rmparam(t, "2")
        rmparam(t, "1")
        rmparam(t, "head")
        rmparam(t, "g")
        rmparam(t, "g2")
        rmparam(t, "sort")
        t.name = "fr-noun"
        if head:
          t.add("head", head)
        t.add("1", g)
        if g2:
          t.add("g2", g2)
        if plural:
          t.add("2", plural)
      elif headtype in ["proper noun", "proper nouns"]:
        head = getparam(t, "head")
        g = getparam(t, "g")
        g2 = getparam(t, "g2")
        # With no g=, a gender code in 3= is used as the gender instead.
        remove_3 = False
        if not g and getparam(t, "3") in ["m", "f", "m-p", "f-p"]:
          g = getparam(t, "3")
          remove_3 = True
        unrecognized_params = False
        for param in t.params:
          pname = unicode(param.name)
          if pname in ["1", "2", "head", "g", "g2", "sort"] or remove_3 and pname in ["3"]:
            pass
          else:
            unrecognized_params = True
            break
        if unrecognized_params:
          pagemsg("WARNING: Unrecognized parameters in %s, skipping"
              % unicode(t))
          continue
        if not g:
          pagemsg("WARNING: No gender given in %s, skipping" % unicode(t))
          continue
        # Rebuild as {{fr-proper noun|head=...|<gender>|g2=...}}.
        rmparam(t, "3")
        rmparam(t, "2")
        rmparam(t, "1")
        rmparam(t, "head")
        rmparam(t, "g")
        rmparam(t, "g2")
        rmparam(t, "sort")
        t.name = "fr-proper noun"
        if head:
          t.add("head", head)
        t.add("1", g)
        if g2:
          t.add("g2", g2)
      elif headtype in ["adjective", "adjectives"]:
        # Only invariable adjectives are converted.
        if getparam(t, "3") in ["invariable", "invariant"]:
          # Work out which params would remain after removing the expected
          # ones; any leftover means the template is too unusual to convert.
          params = dict((unicode(p.name), unicode(p.value)) for p in t.params)
          del params["1"]
          del params["2"]
          del params["3"]
          # g=m together with g2=f is treated as expected and discarded.
          if getparam(t, "g") == "m" and getparam(t, "g2") == "f":
            del params["g"]
            del params["g2"]
          if not params:
            rmparam(t, "g2")
            rmparam(t, "g")
            rmparam(t, "3")
            rmparam(t, "2")
            rmparam(t, "1")
            t.name = "fr-adj"
            t.add("inv", "y")
          else:
            pagemsg("WARNING: Unrecognized parameters in %s, skipping" %
                unicode(t))
        else:
          pagemsg("WARNING: Unrecognized parameters in %s, skipping" %
              unicode(t))
      elif headtype in ["adjective form", "verb form", "verb forms",
          "interjection", "preposition", "prefix", "prefixes",
          "suffix", "suffixes"]:
        # g= is tolerated (and left in place) only for these head types.
        headtype_supports_g = headtype in [
            "adjective form", "suffix", "suffixes"]
        head = getparam(t, "head")
        unrecognized_params = False
        for param in t.params:
          pname = unicode(param.name)
          if pname in ["1", "2", "head", "sort"] or headtype_supports_g and pname == "g":
            pass
          else:
            unrecognized_params = True
            break
        if unrecognized_params:
          pagemsg("WARNING: Unrecognized parameters in %s, skipping"
              % unicode(t))
          continue
        rmparam(t, "sort")
        rmparam(t, "head")
        rmparam(t, "2")
        rmparam(t, "1")
        # Map the head type onto the corresponding fr-* template name.
        t.name = ("fr-adj-form" if headtype == "adjective form" else
            "fr-verb-form" if headtype in ["verb form", "verb forms"] else
            "fr-intj" if headtype == "interjection" else
            "fr-prep" if headtype == "preposition" else
            "fr-prefix" if headtype in ["prefix", "prefixes"] else
            "fr-suffix" # if headtype in ["suffix", "suffixes"]
            )
        if head:
          t.add("head", head)

      # Record a note only for templates that actually changed.
      newt = unicode(t)
      if origt != newt:
        pagemsg("Replacing %s with %s" % (origt, newt))
        notes.append("replaced {{head|fr|%s}} with {{%s}}%s" % (headtype,
          unicode(t.name), " (NEEDS REVIEW)" if fixed_plural_warning else ""))

  newtext = unicode(parsed)
  if newtext != text:
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = newtext
      # The save goes through blib.try_repeatedly, which is also given
      # pagemsg and the operation label "save page".
      blib.try_repeatedly(lambda: page.save(comment=comment), pagemsg,
                    "save page")
    else:
      pagemsg("Would save with comment = %s" % comment)
Esempio n. 41
0
def process_page(page, index, parsed, move_dot, rename):
  """Clean up Hebrew form-of templates on a page.

  Two independent passes, each enabled by a flag:
    move_dot: for templates named in all_he_form_of_templates that have a
      non-empty .= param, remove the param and put its value right after
      the template in the page text.
    rename: for templates named in all_he_form_of_template_map, rename the
      template and rebuild its params as that map specifies.

  Returns (text, notes), or (None, None) when the page title has a colon
  but no recognized namespace prefix.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")
  notes = []

  text = unicode(page.text)

  if ":" in pagetitle and not re.search(
      "^(Citations|Appendix|Reconstruction|Transwiki|Talk|Wiktionary|[A-Za-z]+ talk):", pagetitle):
    pagemsg("WARNING: Colon in page title and not a recognized namespace to include, skipping page")
    return None, None

  if move_dot:
    # First collect (old template text, new template text + dot) pairs; the
    # replacement is then done on the raw page text, because the dot has to
    # land outside the template.
    templates_to_replace = []

    for t in parsed.filter_templates():
      tn = tname(t)
      if tn in all_he_form_of_templates:
        dot = getparam(t, ".")
        if dot:
          origt = unicode(t)
          rmparam(t, ".")
          newt = unicode(t) + dot
          templates_to_replace.append((origt, newt))

    for curr_template, repl_template in templates_to_replace:
      found_curr_template = curr_template in text
      if not found_curr_template:
        pagemsg("WARNING: Unable to locate template: %s" % curr_template)
        continue
      found_repl_template = repl_template in text
      if found_repl_template:
        pagemsg("WARNING: Already found template with period: %s" % repl_template)
        continue
      newtext = text.replace(curr_template, repl_template)
      # Sanity check: the total length change divided by the per-replacement
      # length change is the number of occurrences replaced, so it should
      # be a whole number; more than one occurrence is reported.
      newtext_text_diff = len(newtext) - len(text)
      repl_curr_diff = len(repl_template) - len(curr_template)
      ratio = float(newtext_text_diff) / repl_curr_diff
      if ratio == int(ratio):
        if int(ratio) > 1:
          pagemsg("WARNING: Replaced %s occurrences of curr=%s with repl=%s"
              % (int(ratio), curr_template, repl_template))
      else:
        pagemsg("WARNING: Something wrong, length mismatch during replacement: Expected length change=%s, actual=%s, ratio=%.2f, curr=%s, repl=%s"
            % (repl_curr_diff, newtext_text_diff, ratio, curr_template,
              repl_template))
      text = newtext
      notes.append("move .= outside of {{he-*}} template")

  if rename:
    # Re-parse, since `text` may have been changed by the pass above.
    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
      origt = unicode(t)
      tn = tname(t)
      if tn in all_he_form_of_template_map:
        # Map value is (new name, add_nocap flag); the new name may carry
        # extra specs after a "|".
        newname, add_nocap = all_he_form_of_template_map[tn]
        add_nocap_msg = "|nocap=1" if add_nocap else ""
        newspecs = None
        if "|" in newname:
          newname, newspecs = newname.split("|")
        blib.set_template_name(t, newname)
        # Fetch all params.
        params = []
        old_1 = getparam(t, "1")
        for param in t.params:
          pname = unicode(param.name)
          # 1= is re-added explicitly below; lang= and sc= are dropped.
          if pname.strip() in ["1", "lang", "sc"]:
            continue
          # Flag numbered params that are not expected here; they are still
          # carried over. 2= is not flagged for "he-infinitive of".
          if pname.strip() in (
            newname == "he-infinitive of" and ["3", "4"] or ["2", "3", "4"]
          ):
            errandmsg("WARNING: Found %s= in %s" % (pname.strip(), origt))
          params.append((pname, param.value, param.showkey))
        # Erase all params.
        del t.params[:]
        # Put back basic params
        t.add("1", old_1)
        if newname == "he-verb form of":
          # The specs after "|" become param 2.
          assert newspecs
          t.add("2", newspecs)
          notes.append("rename {{%s}} to {{%s|{{{1}}}|%s%s}}" %
              (tn, newname, newspecs, add_nocap_msg))
        elif newname == "he-noun form of" and newspecs:
          # The specs have the form name=value and become a named param.
          newparam, newval = newspecs.split("=")
          t.add(newparam, newval)
          notes.append("rename {{%s}} to {{%s|{{{1}}}|%s=%s%s}}" %
              (tn, newname, newparam, newval, add_nocap_msg))
        else:
          notes.append("rename {{%s}} to {{%s%s}}" % (tn, newname, add_nocap_msg))
        # Put remaining parameters in order.
        for name, value, showkey in params:
          # More hacking for 'he-form of sing cons': p -> pp, g -> pg, n -> pn
          if newname == "he-noun form of" and newspecs:
            if name in ["p", "g", "n"]:
              name = "p" + name
          t.add(name, value, showkey=showkey, preserve_spacing=False)
        # Finally add nocap=1 if requested.
        if add_nocap:
          t.add("nocap", "1")

      if unicode(t) != origt:
        pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

    text = unicode(parsed)

  return text, notes
Esempio n. 42
0
def process_page(index, page, save, verbose):
  """Canonicalize Russian short-form adjective 'inflection of' definitions.

  Within the (single) ==Russian== section of the page:
    1. Existing {{inflection of|lang=ru|...}} templates whose inflection
       tags describe a short form are rewritten to the canonical tag order
       short|<gender>|s or short|p.
    2. Raw-text definitions such as "masculine short form of [[lemma]]" are
       converted into {{inflection of|lang=ru|lemma||short|<gender>|s}}.
    3. Remaining mentions of "short" that could not be converted produce a
       warning.

  Pages with a colon in the title, or with more than one Russian section,
  are skipped. If the text changed, the page is saved when `save` is true;
  otherwise the would-be comment is only logged. `verbose` additionally logs
  the full old and new text.
  """
  pagetitle = unicode(page.title())
  # NOTE(review): subpagetitle is not used anywhere in this function.
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []
  already_canonicalized = False
  found_short_inflection_of = False
  warned_about_short = False

  foundrussian = False
  # Split on level-2 headers; because of the capturing group the headers
  # sit at odd indices and the section bodies at even indices.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  for j in xrange(2, len(sections), 2):
    if sections[j-1] == "==Russian==\n":
      if foundrussian:
        pagemsg("WARNING: Found multiple Russian sections, skipping page")
        return
      foundrussian = True

      # Try to canonicalize existing 'inflection of'
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
          # Fetch the numbered params starting with 3
          numbered_params = []
          for i in xrange(3,20):
            numbered_params.append(getparam(t, str(i)))
          # Drop trailing empty params.
          while len(numbered_params) > 0 and not numbered_params[-1]:
            del numbered_params[-1]
          # Now canonicalize
          numparamstr = "/".join(numbered_params)
          canon_params = []
          # Single-pass "loop": every path ends in a break, so this only
          # serves to allow early exits.
          while True:
            # Singular short forms: gender, optional singular marker and
            # "short"/"short form" in any order.
            m = (re.search(r"^([mfn])/(?:s|\(singular\))/short(?: form|)$", numparamstr) or
                 re.search(r"^(?:s|\(singular\))/([mfn])/short(?: form|)$", numparamstr) or
                 re.search(r"^short(?: form|)/([mfn])/(?:s|\(singular\))$", numparamstr) or
                 re.search(r"^short(?: form|)/(?:s|\(singular\))/([mfn])$", numparamstr) or
                 re.search(r"^([mfn])/short(?: form|)/(?:s|\(singular\))$", numparamstr) or
                 re.search(r"^(?:s|\(singular\))/short(?: form|)/([mfn])$", numparamstr) or
                 re.search(r"^([mfn])/short(?: form|)$", numparamstr) or
                 re.search(r"^short(?: form|)/([mfn])$", numparamstr)
                 )
            if m:
              found_short_inflection_of = True
              canon_params = ["short", m.group(1), "s"]
              break
            # Plural short forms: plural marker and "short" in either order.
            m = (re.search(r"^(?:p|\(plural\))/short(?: form|)$", numparamstr) or
                 re.search(r"^short(?: form|)/(?:p|\(plural\))$", numparamstr)
                 )
            if m:
              found_short_inflection_of = True
              canon_params = ["short", "p"]
              break
            # Mentions "short" but matches none of the known patterns.
            if "short" in numbered_params or "short form" in numbered_params:
              found_short_inflection_of = True
              warned_about_short = True
              pagemsg("WARNING: Apparent short-form 'inflection of' but can't canonicalize: %s" %
                  unicode(t))
            break
          if canon_params:
            origt = unicode(t)
            # Fetch param 1 and param 2. Erase all numbered params.
            # Put back param 1 and param 2 (this will put them after lang=ru),
            # then the replacements for the higher params.
            param1 = getparam(t, "1")
            param2 = getparam(t, "2")
            for i in xrange(19,0,-1):
              rmparam(t, str(i))
            t.add("1", param1)
            t.add("2", param2)
            for i, param in enumerate(canon_params):
              t.add(str(i+3), param)
            newt = unicode(t)
            if origt != newt:
              pagemsg("Replaced %s with %s" % (origt, newt))
              notes.append("canonicalized 'inflection of' for %s" % "/".join(canon_params))
            else:
              pagemsg("Apparently already canonicalized: %s" % newt)
              already_canonicalized = True
      sections[j] = unicode(parsed)

      # Try to add 'inflection of' to raw-specified singular inflection
      def add_sing_inflection_of(m):
        # Group 1 is the leading "# " or "(", group 2 the gender word and
        # group 3 the lemma inside the link or {{l|ru|...}}/{{m|ru|...}}.
        prefix = m.group(1)
        gender = {"masculine":"m", "male":"m", "feminine":"f", "female":"f",
            "neuter":"n", "neutral":"n"}[m.group(2).lower()]
        lemma = m.group(3)
        retval = prefix + "{{inflection of|lang=ru|%s||short|%s|s}}" % (lemma, gender)
        pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
        notes.append("converted raw to 'inflection of' for short/%s/s" % gender)
        return retval
      newsec = re.sub(r"(# |\()'*(?:short |)(?:form of |)(masculine|male|feminine|female|neuter|neutral) (?:short |)(?:singular |)(?:short |)(?:form of|of|for)'* '*(?:\[\[|\{\{[lm]\|ru\|)(.*?)(?:\]\]|\}\})'*", add_sing_inflection_of,
          sections[j], 0, re.I)
      if newsec != sections[j]:
        found_short_inflection_of = True
      sections[j] = newsec

      # Anything still mentioning "short" that we did not recognize gets a
      # warning quoting the first such line.
      if "short" in sections[j] and not found_short_inflection_of:
        m = re.search("^(.*short.*)$", sections[j], re.M)
        warned_about_short = True
        pagemsg("WARNING: Apparent raw-text short inflection, not converted: %s" %
            (m and m.group(1) or "Can't get line?"))

  new_text = "".join(sections)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)

  # Nothing changed and nothing was already in canonical form.
  if not notes and not already_canonicalized:
    pagemsg("Skipping, no short form found%s" % (
      warned_about_short and " (warning issued)" or " (no warning)"))
def process_page(page, index, parsed):
    """Normalize Old English verb headword templates on one page.

    Two rewrites are applied to every template found in the page text:

    * ``{{head|ang|verb}}`` (or ``verbs``) that carries no params other than
      1=, 2= and head= is renamed to ``{{ang-verb}}``; 1= and 2= are dropped
      and a non-empty head= is moved to 1=.
    * An existing ``{{ang-verb}}`` has head= moved to 1=; head2= and head3= are
      removed and re-added afterwards when non-empty.

    The incoming ``parsed`` argument is ignored: the text is re-parsed from
    ``page.text``.

    Returns a ``(parsed, notes)`` tuple holding the modified parse tree and the
    list of change notes.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    notes = []

    text = unicode(page.text)
    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if tn == "head" and getparam(t, "1") == "ang" and getparam(
                t, "2") in ["verb", "verbs"]:
            for param in t.params:
                if pname(param) not in ["1", "2", "head"]:
                    pagemsg("WARNING: head|ang|verb with extra params: %s" %
                            unicode(t))
                    break
            else:
                # Loop finished without break: only convertible params present.
                blib.set_template_name(t, "ang-verb")
                rmparam(t, "1")
                rmparam(t, "2")
                notes.append("convert {{head|ang|verb}} into {{ang-verb}}")
                head = getparam(t, "head")
                if head:
                    t.add("1", head)
                rmparam(t, "head")
        elif tn == "ang-verb":
            head = getparam(t, "head")
            head2 = getparam(t, "head2")
            head3 = getparam(t, "head3")
            rmparam(t, "head")
            rmparam(t, "head2")
            rmparam(t, "head3")
            if head:
                t.add("1", head)
            if head2:
                t.add("head2", head2)
            if head3:
                t.add("head3", head3)
            # Only record a note when the template text really changed;
            # otherwise every untouched {{ang-verb}} would add a spurious
            # entry to the edit summary.
            if unicode(t) != origt:
                notes.append("move head= to 1= in {{ang-verb}}")
        if unicode(t) != origt:
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
    return parsed, notes
Esempio n. 44
0
    def fix_up_section(sectext):
        # Rewrite manually-specified French {{IPA}} pronunciations found in the
        # section text `sectext` as {{fr-IPA}} when the pronunciation produced by
        # the fr-pron module agrees with the manual one (after some
        # normalization of both).
        #
        # Uses pagemsg, pagetitle, notes and expand_text, none of which are
        # defined inside this function.
        # NOTE(review): no return statement is visible in the lines shown here;
        # confirm against the full file how the modified `parsed` tree is used.
        parsed = blib.parse_text(sectext)

        pronun_templates = []
        verb_templates = []
        nonverb_templates = []
        # First pass: sort the headword templates in the section into verb and
        # non-verb ones; this decides the pos= value passed to the pronunciation
        # module below.
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn in french_nonverb_head_templates:
                nonverb_templates.append(t)
            elif tn in french_verb_head_templates:
                verb_templates.append(t)
            elif tn == "head":
                if getparam(t, "1").strip() != "fr":
                    pagemsg(
                        "WARNING: Saw wrong-language {{head}} template: %s" %
                        unicode(t))
                else:
                    pos = getparam(t, "2").strip()
                    if pos in french_verb_head_pos:
                        verb_templates.append(t)
                    else:
                        nonverb_templates.append(t)
        if verb_templates and nonverb_templates:
            pagemsg(
                "WARNING: Saw both verb template(s) %s and non-verb template(s) %s, using pos=vnv"
                % (",".join(unicode(x) for x in verb_templates), ",".join(
                    unicode(x) for x in nonverb_templates)))
        if not verb_templates and not nonverb_templates:
            pagemsg("WARNING: Didn't see any French templates")
        # Second pass: examine each {{IPA}} template.
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn == "IPA":
                # Find the full line containing the template, for log messages.
                m = re.search("^.*?%s.*$" % re.escape(unicode(t)), sectext,
                              re.M)
                if not m:
                    pagemsg(
                        "WARNING: Couldn't find template %s in section text" %
                        unicode(t))
                    line = "(unknown)"
                else:
                    line = m.group(0)
                # The language is either in lang= (pronunciations then start at
                # 1=) or in 1= (pronunciations then start at 2=).
                if t.has("lang"):
                    first_param = 1
                    lang = getparam(t, "lang")
                else:
                    first_param = 2
                    lang = getparam(t, "1")
                if lang != "fr":
                    pagemsg(
                        "WARNING: Saw wrong-language {{IPA}} template: %s in line <%s>"
                        % (unicode(t), line))
                    continue
                pron = getparam(t, str(first_param))
                if not pron:
                    pagemsg(
                        "WARNING: No pronun in {{IPA}} template: %s in line <%s>"
                        % (unicode(t), line))
                    continue
                # Only templates with a single pronunciation are handled; the
                # next three numbered params are checked for extras.
                if getparam(t, str(first_param + 1)) or getparam(
                        t, str(first_param + 2)) or getparam(
                            t, str(first_param + 3)):
                    pagemsg(
                        "WARNING: Multiple pronuns in {{IPA}} template: %s in line <%s>"
                        % (unicode(t), line))
                    continue
                # "vnv" when both verb and non-verb headwords were seen, "v" when
                # only verbs were seen, otherwise empty (no pos= argument).
                pos_val = ("vnv" if verb_templates and nonverb_templates else
                           "v" if verb_templates else "")
                pos_arg = "|pos=%s" % pos_val if pos_val else ""
                #autopron = expand_text("{{#invoke:User:Benwing2/fr-pron|show|%s%s}}" % (
                autopron = expand_text("{{#invoke:fr-pron|show|%s%s}}" %
                                       (pagetitle, pos_arg))
                if not autopron:
                    continue
                # Normalize the manual pronunciation: strip surrounding /.../ or
                # [...], trim whitespace, and write r as ʁ.
                pron = re.sub("^/(.*)/$", r"\1", pron)
                pron = re.sub(r"^\[(.*)\]$", r"\1", pron)
                pron = pron.strip()
                pron = pron.replace("r", u"ʁ")
                # account for various common errors in Dawnraybot's generated pronunciations:
                # #1
                if pagetitle.endswith("rez") and pron.endswith(u"ʁɔe"):
                    pron = re.sub(u"ʁɔe$", u"ʁe", pron)
                # #2
                if re.search("ai(s|t|ent)$",
                             pagetitle) and pron.endswith(u"e"):
                    pron = re.sub(u"e$", u"ɛ", pron)
                # #3
                if pos_val == "v" and pagetitle.endswith(
                        "ai") and pron.endswith(u"ɛ"):
                    pron = re.sub(u"ɛ$", u"e", pron)
                # If the manual pronunciation has no syllable dots, drop them
                # from the auto-generated one as well before comparing.
                if "." not in pron:
                    autopron = autopron.replace(".", "")
                # Make a final ɑ (optionally followed by m or t) in the
                # auto-generated pronunciation agree with a manual a.
                if autopron.endswith(u"ɑ") and pron.endswith("a"):
                    autopron = autopron[:-1] + "a"
                if re.search(ur"ɑ[mt]$", autopron) and re.search(
                        u"a[mt]$", pron):
                    autopron = re.sub(ur"ɑ([mt])$", r"a\1", autopron)
                for i in xrange(2):
                    # {{fr-IPA}} deletes schwa in the sequence V.Cə.CV esp. in the
                    # sequence V.Cə.ʁV in verbs, whereas the bot-generated pronunciation
                    # doesn't. We have separate cases depending on the identity of C,
                    # which may go before or after the syllable break. Do it twice in
                    # case it occurs twice in a row in a single word.
                    pron = re.sub(
                        ur"([aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])\.([jlmnɲwʃʒ])ə\.(ʁ[aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])",
                        r"\1\2.\3", pron)
                    pron = re.sub(
                        ur"([aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])\.([szfvtdpbkɡ])ə\.(ʁ[aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])",
                        r"\1.\2\3", pron)
                # {{fr-IPA}} converts sequences of Crj and Clj to Cri.j and Cli.j,
                # which is correct, but Dawnraybot doesn't do that.
                pron = re.sub(u"([szfvtdpbkɡ][ʁl])j", r"\1i.j", pron)
                # Compare the two pronunciations. Every kind of mismatch is
                # logged; only a difference in syllable division alone sets
                # allow_mismatch and so still lets the replacement go ahead.
                allow_mismatch = False
                if pron != autopron:
                    tempcall = "{{fr-IPA%s}}" % pos_arg
                    if pron.replace(u"ɑ", "a") == autopron.replace(u"ɑ", "a"):
                        pagemsg(
                            u"WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in ɑ vs. a only: line <%s>"
                            % (unicode(t), tempcall, autopron, pron, line))
                    elif re.sub(u"ɛ(.)", r"e\1",
                                pron) == re.sub(u"ɛ(.)", r"e\1", autopron):
                        pagemsg(
                            u"WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in ɛ vs. e only: line <%s>"
                            % (unicode(t), tempcall, autopron, pron, line))
                    elif pron.replace(".", "") == autopron.replace(".", ""):
                        pagemsg(
                            "WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in syllable division only: line <%s>"
                            % (unicode(t), tempcall, autopron, pron, line))
                        allow_mismatch = True
                    elif pron.replace(".",
                                      "").replace(" ", "") == autopron.replace(
                                          ".", "").replace(" ", ""):
                        pagemsg(
                            "WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in syllable/word division only: line <%s>"
                            % (unicode(t), tempcall, autopron, pron, line))
                    else:
                        pagemsg(
                            "WARNING: Can't replace %s with %s because auto-generated pron %s doesn't match %s: line <%s>"
                            % (unicode(t), tempcall, autopron, pron, line))
                    if not allow_mismatch:
                        continue
                # Convert the template in place: drop the language and the manual
                # pronunciation, rename to fr-IPA, and add pos= when needed.
                origt = unicode(t)
                rmparam(t, "lang")
                rmparam(t, "1")
                rmparam(t, str(first_param))
                blib.set_template_name(t, "fr-IPA")
                if pos_val:
                    t.add("pos", pos_val)
                notes.append(
                    "replace manually-specified {{IPA|fr}} pronun with {{fr-IPA}}"
                )
                pagemsg("Replaced %s with %s: line <%s>" %
                        (origt, unicode(t), line))
                # Flag lines that also carry an {{a|...}} template, as the log
                # message below describes it, an accent spec.
                if "{{a|" in line:
                    pagemsg(
                        "WARNING: Replaced %s with %s on a line with an accent spec: line <%s>"
                        % (origt, unicode(t), line))
def process_page(page, index, parsed):
  """Normalize Bulgarian verb and adjective headword templates on one page.

  * ``{{head|bg|verb}}`` / ``{{head|bg|adjective}}`` (singular or plural POS
    name) whose only params are 1=, 2=, head= and, for adjectives, g=m is
    converted to ``{{bg-verb}}`` / ``{{bg-adj}}`` with the headword in 1=.
  * An existing ``{{bg-verb}}`` / ``{{bg-adj}}`` is rebuilt so that the
    headword (head= or 1=, else the page title) is in 1= and the aspect
    (a= or 2=) is in 2=; all other non-empty params are re-added after them.
    g=m is dropped from ``{{bg-adj}}``; any other g= value causes the template
    to be skipped with a warning.

  The incoming ``parsed`` argument is ignored: the text is re-parsed from
  ``page.text``.

  Returns a ``(parsed, notes)`` tuple holding the modified parse tree and the
  list of change notes.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  notes = []

  text = unicode(page.text)
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn == "head" and getparam(t, "1") == "bg" and getparam(t, "2") in [
        "verb", "verbs", "adjective", "adjectives"]:
      pos = getparam(t, "2")
      if pos in ["verb", "verbs"]:
        newtn = "bg-verb"
      else:
        newtn = "bg-adj"
      for param in t.params:
        param_name = unicode(param.name).strip()
        param_val = unicode(param.value).strip()
        if (param_name not in ["1", "2", "head", "g"] or
            param_name == "g" and (newtn != "bg-adj" or param_val != "m")):
          pagemsg("WARNING: head|bg|%s with extra param %s=%s: %s" % (
            pos, param_name, param_val, origt))
          break
      else: # no break
        rmparam(t, "1")
        rmparam(t, "2")
        rmparam(t, "g")
        head = getparam(t, "head")
        rmparam(t, "head")
        blib.set_template_name(t, newtn)
        t.add("1", rulib.remove_monosyllabic_accents(head or pagetitle))
        notes.append("convert {{head|bg|%s}} into {{%s}}" % (pos, newtn))
    elif tn == "bg-verb" or tn == "bg-adj":
      if tn == "bg-adj":
        g = getparam(t, "g")
        if g and g != "m":
          pagemsg("WARNING: Saw g=%s in %s" % (g, origt))
          continue
        if t.has("g"):
          rmparam(t, "g")
          notes.append("remove g=%s from {{%s}}" % (g, tn))
      head = getparam(t, "head") or getparam(t, "1")
      rmparam(t, "head")
      rmparam(t, "1")
      # Remember whether the aspect really came from a=, so that the
      # "move a= to 2=" note is only emitted when something was moved.
      named_aspect = getparam(t, "a")
      a = named_aspect or getparam(t, "2")
      rmparam(t, "a")
      rmparam(t, "2")
      if a in ["impf-pf", "pf-impf", "dual", "ip", "both"]:
        a = "both"
      elif a and a not in ["impf", "pf"]:
        pagemsg("WARNING: Unrecognized aspect %s in %s" % (a, origt))
      # Save the remaining non-empty params so they can be re-added after
      # 1= and 2=.
      saved_params = []
      for param in t.params:
        param_name = unicode(param.name).strip()
        param_val = unicode(param.value).strip()
        if not param_val:
          continue
        saved_params.append((param_name, param_val, param.showkey))
      # Erase all params.
      del t.params[:]
      # Put back new params.
      t.add("1", rulib.remove_monosyllabic_accents(head or pagetitle))
      if a:
        t.add("2", a)
      for param_name, param_val, showkey in saved_params:
        t.add(param_name, param_val, showkey=showkey, preserve_spacing=False)
      # Record notes only when the rebuilt template differs from the original;
      # an already-normalized template must not add to the edit summary.
      if unicode(t) != origt:
        notes.append("move head= to 1= in {{%s}}" % tn)
        if named_aspect:
          notes.append("move a= to 2= in {{%s}}" % tn)
    if unicode(t) != origt:
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return parsed, notes
Esempio n. 46
0
def process_page(page, index, parsed):
  """Normalize Bulgarian noun headword templates on one page.

  * ``{{head|bg|noun}}`` / ``{{head|bg|proper noun}}`` (singular or plural POS
    name) is converted to ``{{bg-noun}}`` / ``{{bg-proper noun}}``: the
    headword goes in 1=, genders from g=/g2=/g3= go in the chain starting at
    2=, and "masculine"/"feminine" equivalents listed in 3=, 4=, "or", 6=, ...
    go in the m=/f= chains. Templates with any other params are left alone
    with a warning.
  * An existing ``{{bg-noun}}`` / ``{{bg-proper noun}}`` is rebuilt: head= or
    sg= moves to 1=, and genders found in 1= (when it is "m" or "f"), 2=, g=,
    g2= and g3= are split into single genders and stored in the chain starting
    at 2=. Remaining non-empty params are re-added afterwards.

  The incoming ``parsed`` argument is ignored: the text is re-parsed from
  ``page.text``.

  Returns a ``(parsed, notes)`` tuple holding the modified parse tree and the
  list of change notes.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  notes = []

  text = unicode(page.text)
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn == "head" and getparam(t, "1") == "bg" and getparam(t, "2") in [
        "noun", "nouns", "proper noun", "proper nouns"]:
      pos = getparam(t, "2")
      for param in t.params:
        param_name = unicode(param.name).strip()
        param_val = unicode(param.value).strip()
        if (param_name not in ["1", "2", "head", "g", "g2", "g3", "3", "4", "5", "6", "7", "8", "9", "10"] or
            param_name == "3" and param_val not in ["masculine", "feminine"] or
            param_name in ["5", "7", "9"] and param_val != "or"):
          pagemsg("WARNING: head|bg|%s with extra param %s=%s: %s" % (
            pos, param_name, param_val, origt))
          break
      else: # no break
        rmparam(t, "1")
        rmparam(t, "2")
        m = []
        f = []
        head = getparam(t, "head")
        rmparam(t, "head")
        genders = []
        def process_gender(g):
          if g in ["m", "f", "n", "m-p", "f-p", "n-p", "p"]:
            genders.append(g)
          else:
            pagemsg("WARNING: Unrecognized gender '%s'" % g)
        for gparam in ["g", "g2", "g3"]:
          g = getparam(t, gparam)
          if g:
            process_gender(g)
          rmparam(t, gparam)
        def handle_mf(array):
          # Collect the equivalents in 4=, 6=, 8=, ... (separated by "or" in
          # 5=, 7=, ...) into `array`, removing the params as we go.
          array.append(getparam(t, "4"))
          rmparam(t, "3")
          rmparam(t, "4")
          i = 5
          while getparam(t, str(i)) == "or":
            array.append(getparam(t, str(i + 1)))
            rmparam(t, str(i))
            rmparam(t, str(i + 1))
            i += 2
        if getparam(t, "3") == "masculine":
          handle_mf(m)
        if getparam(t, "3") == "feminine":
          handle_mf(f)
        if pos in ["noun", "nouns"]:
          newtn = "bg-noun"
        else:
          newtn = "bg-proper noun"
        blib.set_template_name(t, newtn)
        t.add("1", head or pagetitle)
        blib.set_param_chain(t, genders, "2", "g")
        if m:
          blib.set_param_chain(t, m, "m", "m")
        if f:
          blib.set_param_chain(t, f, "f", "f")
        notes.append("convert {{head|bg|%s}} into {{%s}}" % (pos, newtn))
    elif tn in ["bg-noun", "bg-proper noun"]:
      g = None
      cur1 = getparam(t, "1")
      if cur1 in ["m", "f"]:
        g = cur1
      elif re.search("[a-zA-Z]", cur1):
        pagemsg("WARNING: Saw Latin in 1=%s in %s" % (cur1, origt))
        continue
      head = getparam(t, "head") or getparam(t, "sg")
      rmparam(t, "head")
      rmparam(t, "sg")
      genders = []
      def process_gender(g):
        # Append the single gender(s) named by `g` to `genders`; return True
        # when `g` was recognized, False (after a warning) otherwise.
        if g in ["m", "f", "n", "m-p", "f-p", "n-p", "p"]:
          genders.append(g)
        elif g in ["mf", "fm"]:
          genders.append("m")
          genders.append("f")
        elif g in ["mn", "nm"]:
          genders.append("m")
          genders.append("n")
        elif g in ["fn", "nf"]:
          genders.append("f")
          genders.append("n")
        elif g in ["mfn", "fmn", "mnf", "nmf", "fnm", "nfm"]:
          genders.append("m")
          genders.append("f")
          genders.append("n")
        else:
          pagemsg("WARNING: Unrecognized gender '%s'" % g)
          return False
        return True
      if g:
        process_gender(g)
        rmparam(t, "1")
      g = getparam(t, "2")
      # When 2= held a recognized gender it is now represented in `genders`.
      # Its raw value must not be re-added below, or it would overwrite the
      # first entry written by set_param_chain (e.g. 1=m, 2=f used to end up
      # as 2=f, g2=f). An unrecognized 2= is still kept as before.
      old_2_superseded = bool(g) and process_gender(g)
      for gparam in ["g", "g2", "g3"]:
        g = getparam(t, gparam)
        if g:
          process_gender(g)
        rmparam(t, gparam)
      saved_params = []
      for param in t.params:
        param_name = unicode(param.name).strip()
        param_val = unicode(param.value).strip()
        if not param_val:
          continue
        if param_name == "2" and old_2_superseded:
          continue
        saved_params.append((param_name, param_val, param.showkey))
      # Erase all params.
      del t.params[:]
      # Put back new params.
      t.add("1", rulib.remove_monosyllabic_accents(head or pagetitle))
      blib.set_param_chain(t, genders, "2", "g")
      for param_name, param_val, showkey in saved_params:
        t.add(param_name, param_val, showkey=showkey, preserve_spacing=False)
      if origt != unicode(t):
        notes.append("move head=/sg= to 1=, g= to 2= in {{%s}}" % tn)
    if unicode(t) != origt:
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return parsed, notes
Esempio n. 47
0
def process_page(page, index, parsed):
  """Rename legacy reference/quotation templates on one page and fix their params.

  The page text is split on ``==...==`` header lines and each body section is
  processed separately:

  * ``reference-journal`` / ``reference-news`` / ``reference-book`` become
    ``cite-journal`` / ``cite-book`` inside a References section or a
    ``<ref>`` tag, and ``quote-journal`` / ``quote-book`` elsewhere.
  * Each ``(from, to)`` pair in the externally defined ``simple_replace`` is
    applied as a plain rename.
  * ``cite-usenet`` / ``quote-usenet`` become ``quote-newsgroup``; a
    prefix=# or prefix=#* param is replaced by a literal "#* " inserted before
    the template, and a resulting doubled "#* #* " line prefix is collapsed.

  Parameter clean-ups (page/pages, origyear/year, id -> isbn, positional
  usenet params, ...) are done by the nested helpers.

  The incoming ``parsed`` argument is not used; sections are parsed afresh.

  Returns None when the page is skipped (nonexistent, or in an unrecognized
  namespace), otherwise a ``(new_text, notes)`` tuple.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist")
    return

  if ":" in pagetitle and not re.search(
      "^(Citations|Appendix|Reconstruction|Transwiki|Talk|Wiktionary|[A-Za-z]+ talk):", pagetitle):
    pagemsg("WARNING: Colon in page title and not a recognized namespace to include, skipping page")
    return

  text = unicode(page.text)
  notes = []

  # Even indices hold section bodies, odd indices the captured header lines.
  subsections = re.split("(^==.*==\n)", text, 0, re.M)

  def move_param(t, fr, to, frob_from=None):
    # Rename param `fr` of template `t` to `to`, optionally transforming the
    # value with `frob_from` (which returns None/blank to veto the move).
    # A blank `fr` is simply removed; a non-blank existing `to` blocks the
    # move with a warning.
    if t.has(fr):
      oldval = getparam(t, fr)
      if not oldval.strip():
        rmparam(t, fr)
        pagemsg("Removing blank param %s" % fr)
        return
      if frob_from:
        newval = frob_from(oldval)
        if not newval or not newval.strip():
          return
      else:
        newval = oldval

      if getparam(t, to).strip():
        pagemsg("WARNING: Would replace %s= -> %s= but %s= is already present: %s"
            % (fr, to, to, unicode(t)))
      elif oldval != newval:
        rmparam(t, to) # in case of blank param
        # If either old or new name is a number, use remove/add to automatically set the
        # showkey value properly; else it's safe to just change the name of the param,
        # which will preserve its location.
        if re.search("^[0-9]+$", fr) or re.search("^[0-9]+$", to):
          rmparam(t, fr)
          t.add(to, newval)
        else:
          tfr = t.get(fr)
          tfr.name = to
          tfr.value = newval
        pagemsg("%s=%s -> %s=%s" % (fr, oldval.replace("\n", r"\n"), to,
          newval.replace("\n", r"\n")))
      else:
        rmparam(t, to) # in case of blank param
        # See comment above.
        if re.search("^[0-9]+$", fr) or re.search("^[0-9]+$", to):
          rmparam(t, fr)
          t.add(to, newval)
        else:
          t.get(fr).name = to
        pagemsg("%s -> %s" % (fr, to))

  def fix_page_params(t):
    # Strip a leading "p."/"pp." from page=/pages=, then move a purely numeric
    # pages= to page= and a "NNN-" style page= to pages=.
    # Returns True if the template changed.
    origt = unicode(t)
    for param in ["page", "pages"]:
      pageval = getparam(t, param)
      if re.search(r"^\s*pp?\.\s*", pageval):
        pageval = re.sub(r"^(\s*)pp?\.\s*", r"\1", pageval)
        t.add(param, pageval)
        notes.append("remove p(p). from %s=" % param)
        pagemsg("remove p(p). from %s=" % param)
    if re.search(r"^[0-9]+$", getparam(t, "pages").strip()):
      move_param(t, "pages", "page")
    if re.search(r"^[0-9]+[-–—]$", getparam(t, "page").strip()):
      move_param(t, "page", "pages")
    return origt != unicode(t)

  def fix_cite_book_params(t):
    # Convert old reference-book params: year -> year_published and
    # origyear -> year (when both are set), origdate -> date,
    # origmonth -> month, id -> isbn, plus the page fixes.
    # Returns True if the template changed.
    origt = unicode(t)
    if getparam(t, "origyear").strip() and getparam(t, "year").strip():
      if getparam(t, "year_published"):
        pagemsg("WARNING: Would set year_published= but is already present: %s"
            % unicode(t))
      else:
        rmparam(t, "year_published") # in case of blank param
        t.get("year").name = "year_published"
        t.get("origyear").name = "year"
        pagemsg("year -> year_published, origyear -> year")
    move_param(t, "origdate", "date")
    move_param(t, "origmonth", "month")
    def frob_isbn(idval):
      # Strip a leading "ISBN"-style label; accept bare values starting with a
      # digit; otherwise warn and veto the move by returning None.
      isbn_re = r"^(\s*)(10-ISBN +|ISBN-13 +|ISBN:? +|ISBN[-=] *)"
      if re.search(isbn_re, idval, re.I):
        return re.sub(isbn_re, r"\1", idval, 0, re.I)
      elif re.search(r"^[0-9]", idval.strip()):
        return idval
      else:
        pagemsg("WARNING: Would replace id= -> isbn= but id=%s doesn't begin with 'ISBN '" %
            idval.replace("\n", r"\n"))
        return None
    move_param(t, "id", "isbn", frob_isbn)
    fix_page_params(t)
    return origt != unicode(t)

  def fix_cite_usenet_params(t):
    # Returns True if the template changed.
    origt = unicode(t)
    move_param(t, "group", "newsgroup")
    move_param(t, "link", "url")
    return origt != unicode(t)

  def fix_quote_usenet_params(t):
    # Merge monthday= and year= into date=, then rename the named and
    # positional params to their quote-newsgroup equivalents.
    # Returns True if the template changed.
    origt = unicode(t)
    monthday = getparam(t, "monthday").strip()
    year = getparam(t, "year").strip()
    if monthday and year:
      if getparam(t, "date"):
        pagemsg("WARNING: Would set date= but is already present: %s"
            % unicode(t))
      else:
        rmparam(t, "date") # in case of blank param
        param = t.get("monthday")
        param.name = "date"
        if re.search("^[0-9]+/[0-9]+$", monthday):
          param.value = "%s/%s" % (monthday, year)
        else:
          param.value = "%s %s" % (monthday, year)
        rmparam(t, "year")
        pagemsg("monthday/year -> date")
    move_param(t, "group", "newsgroup")
    move_param(t, "text", "passage")
    move_param(t, "6", "passage")
    move_param(t, "5", "url")
    move_param(t, "4", "newsgroup")
    move_param(t, "3", "title")
    move_param(t, "2", "author")
    move_param(t, "1", "date")
    return origt != unicode(t)

  def replace_in_reference(parsed, in_what):
    # Rename reference-* templates found in `parsed` to their cite-*
    # equivalents; `in_what` only labels the log messages.
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      origt = unicode(t)
      if tname.strip() in ["reference-journal", "reference-news"]:
        set_template_name(t, "cite-journal", tname)
        pagemsg("%s -> cite-journal" % tname.strip())
        notes.append("%s -> cite-journal" % tname.strip())
        fix_page_params(t)
        pagemsg("Replacing %s with %s in %s" %
            (origt, unicode(t), in_what))
      if tname.strip() == "reference-book":
        set_template_name(t, "cite-book", tname)
        pagemsg("reference-book -> cite-book")
        fixed_params = fix_cite_book_params(t)
        notes.append("reference-book -> cite-book%s" % (
          fixed_params and " and fix book cite params" or ""))
        pagemsg("Replacing %s with %s in %s" %
            (origt, unicode(t), in_what))

  for j in xrange(0, len(subsections), 2):
    parsed = blib.parse_text(subsections[j])
    if j > 0 and re.search(r"^===*References===*\n", subsections[j-1]):
      replace_in_reference(parsed, "==References== section")
      subsections[j] = unicode(parsed)
    else:
      for t in parsed.filter_tags():
        if unicode(t.tag) == "ref":
          tagparsed = mw.wikicode.Wikicode([t])
          replace_in_reference(tagparsed, "<ref>")
          subsections[j] = unicode(parsed)
    # Anything still carrying a legacy name at this point is outside a
    # reference context and becomes a quote-* template.
    need_to_replace_double_quote_prefixes = False
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      origt = unicode(t)
      for fr, to in simple_replace:
        if tname.strip() == fr:
          set_template_name(t, to, tname)
          pagemsg("%s -> %s" % (fr, to))
          notes.append("%s -> %s" % (fr, to))
          fix_page_params(t)
          pagemsg("Replacing %s with %s" % (origt, unicode(t)))
      if tname.strip() in ["reference-journal", "reference-news"]:
        set_template_name(t, "quote-journal", tname)
        pagemsg("%s -> quote-journal" % tname.strip())
        notes.append("%s -> quote-journal" % tname.strip())
        fix_page_params(t)
        pagemsg("Replacing %s with %s outside of reference section" %
            (origt, unicode(t)))
      if tname.strip() == "reference-book":
        set_template_name(t, "quote-book", tname)
        # The log message and the note name the template actually written
        # (quote-book, not cite-book).
        pagemsg("reference-book -> quote-book")
        fixed_params = fix_cite_book_params(t)
        notes.append("reference-book -> quote-book%s" % (
          fixed_params and " and fix book cite params" or ""))
        pagemsg("Replacing %s with %s outside of reference section" %
            (origt, unicode(t)))
      if tname.strip() in ["cite-usenet", "quote-usenet"]:
        if tname.strip() == "cite-usenet":
          fixed_params = fix_cite_usenet_params(t)
        else:
          fixed_params = fix_quote_usenet_params(t)
        set_template_name(t, "quote-newsgroup", tname)
        pagemsg("%s -> quote-newsgroup" % tname.strip())
        prefix = getparam(t, "prefix").strip()
        removed_prefix = False
        if prefix:
          if prefix in ["#", "#*"]:
            parsed.insert_before(t, "#* ")
            rmparam(t, "prefix")
            pagemsg("remove prefix=%s, insert #* before template" % prefix)
            need_to_replace_double_quote_prefixes = True
            removed_prefix = True
          else:
            pagemsg("WARNING: Found prefix=%s, not # or #*: %s" %
                (prefix, unicode(t)))
        notes.append("%s -> quote-newsgroup%s%s" % (tname.strip(),
          removed_prefix and
            ", remove prefix=%s, insert #* before template" % prefix or "",
          fixed_params and ", fix params" or ""))
        pagemsg("Replacing %s with %s" % (origt, unicode(t)))
    subsections[j] = unicode(parsed)
    if need_to_replace_double_quote_prefixes:
      # Inserting "#* " before a template that already sat on a "#* " line
      # leaves a doubled prefix; collapse it.
      newval = re.sub(r"^#\* #\* ", "#* ", subsections[j], 0, re.M)
      if newval != subsections[j]:
        notes.append("remove double #* prefix")
        pagemsg("Removed double #* prefix")
      subsections[j] = newval

  return "".join(subsections), notes
Esempio n. 48
0
def getrmparam(t, param):
    """Read ``param`` from template ``t``, delete it from ``t``, and return the value read."""
    fetched = getparam(t, param)
    rmparam(t, param)
    return fetched
Esempio n. 49
0
def process_page(page, index, parsed):
  # Clean up Bulgarian noun-form entries in the already-parsed page `parsed`.
  # Three passes are made over the templates:
  #   1. {{bg-noun-form}} is converted to {{head|bg|noun form}}.
  #   2. {{bg-noun form of}} is converted to {{inflection of}}, and for
  #      recognized inflection templates the accented form matching the page
  #      title is looked up and stored in the preceding head template.
  #   3. Templates listed in template_to_infl_codes are converted to
  #      {{inflection of}} with the corresponding inflection codes.
  # Returns (unicode(parsed), notes), where notes is the list of change notes.
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []

  pagemsg("Processing")

  # Pass 1: {{bg-noun-form}} -> {{head|bg|noun form}}.
  for t in parsed.filter_templates():
    if tname(t) == "bg-noun-form":
      origt = unicode(t)
      # Leave templates with any unexpected param untouched.
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2", "3", "head"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      # 1= and 2= are discarded; head= is kept as head= and 3= is re-added as
      # g=.
      rmparam(t, "1")
      rmparam(t, "2")
      head = getparam(t, "head")
      rmparam(t, "head")
      g = getparam(t, "3")
      rmparam(t, "3")
      blib.set_template_name(t, "head")
      t.add("1", "bg")
      t.add("2", "noun form")
      if head:
        t.add("head", head)
      else:
        # Without an explicit head=, the page title is used only when
        # bglib.needs_accents() says it needs no accent added.
        if bglib.needs_accents(pagetitle):
          pagemsg("WARNING: Can't add head= to {{bg-noun-form}} missing it because pagetitle is multisyllabic: %s" %
              unicode(t))
        else:
          t.add("head", pagetitle)
      if g:
        t.add("g", g)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("replace {{bg-noun-form}} with {{head|bg|noun form}}")

  # Pass 2: pair each inflection template with the most recently seen
  # {{head|bg|noun form}} template (headt).
  headt = None
  saw_infl_after_head = False
  saw_headt = False
  saw_inflt = False
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    # saw_infl is the key looked up in `forms` further down; it stays False
    # when this template does not identify one.
    saw_infl = False
    already_fetched_forms = False
    if tn == "head" and getparam(t, "1") == "bg" and getparam(t, "2") == "noun form":
      saw_headt = True
      if headt and not saw_infl_after_head:
        pagemsg("WARNING: Saw two head templates %s and %s without intervening inflection" % (
          unicode(headt), origt))
      saw_infl_after_head = False
      headt = t
    if tn == "bg-noun form of":
      saw_inflt = True
      if not headt:
        pagemsg("WARNING: Saw {{bg-noun form of}} without head template: %s" % origt)
        continue
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2", "3", "noun"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      saw_infl_after_head = True
      noun = getparam(t, "noun")
      if not noun:
        pagemsg("WARNING: Didn't see noun=: %s" % origt)
        continue
      # Translate the spelled-out values in 2=, 3= and 1= into inflection
      # codes, in that order.
      infls = []
      param2 = getparam(t, "2")
      if param2 == "indefinite":
        infls.append("indef")
      elif param2 == "definite":
        infls.append("def")
      elif param2 == "vocative":
        infls.append("voc")
      elif param2:
        pagemsg("WARNING: Saw unrecognized 2=%s: %s" % (param2, origt))
        continue
      param3 = getparam(t, "3")
      if param3 == "subject":
        infls.append("sbjv")
      elif param3 == "object":
        infls.append("objv")
      elif param3:
        pagemsg("WARNING: Saw unrecognized 3=%s: %s" % (param3, origt))
        continue
      param1 = getparam(t, "1")
      if param1 == "singular":
        infls.append("s")
      elif param1 == "plural":
        infls.append("p")
      elif param1 == "count":
        infls.extend(["count", "form"])
      elif param1 == "vocative":
        infls.extend(["voc", "s"])
      else:
        pagemsg("WARNING: Saw unrecognized 1=%s: %s" % (param1, origt))
        continue
      # Rebuild the template as {{inflection of|bg|<lemma>||<codes...>}}.
      blib.set_template_name(t, "inflection of")
      del t.params[:]
      t.add("1", "bg")
      lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
      if not lemma:
        pagemsg("WARNING: Unable to find accented equivalent of %s: %s" % (noun, origt))
        t.add("2", noun)
      else:
        t.add("2", lemma)
      t.add("3", "")
      for i, infl in enumerate(infls):
        t.add(str(i + 4), infl)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{bg-noun form of}} to {{inflection of}}")
      tn = tname(t)
      # NOTE(review): this assigns `saw_infls` (trailing s), but the code
      # further down tests `saw_infl`, which is still False here. As written,
      # the slot computed from infls is only used for the warning below and
      # never reaches the head-updating code -- confirm whether `saw_infl`
      # was intended.
      saw_infls = infls_to_slot(infls)
      already_fetched_forms = True
      if not saw_infls:
        pagemsg("WARNING: Unrecognized inflections %s: %s" % ("|".join(infls), origt))
    elif tn == "inflection of" and getparam(t, "1") == "bg":
      saw_inflt = True
      # Collect the inflection codes from 4= onwards, stopping at the first
      # missing or empty one.
      infls = []
      i = 4
      while True:
        infl = getparam(t, str(i))
        if not infl:
          break
        infls.append(infl)
        i += 1
      # NOTE(review): same `saw_infls` vs. `saw_infl` mismatch as above.
      saw_infls = infls_to_slot(infls)
      if not saw_infls:
        if "vnoun" in infls:
          pagemsg("Skipping verbal noun inflection %s: %s" % ("|".join(infls), origt))
        elif "part" in infls:
          pagemsg("Skipping participle inflection %s: %s" % ("|".join(infls), origt))
        else:
          pagemsg("WARNING: Unrecognized inflections %s: %s" % ("|".join(infls), origt))
    elif tn == "definite singular of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "def_sg"
    elif tn == "indefinite plural of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "ind_pl"
    elif tn == "definite plural of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "def_pl"
    elif tn == "vocative singular of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "voc_sg"
    if saw_infl:
      if not already_fetched_forms:
        # Look up the lemma named in 2= and replace it with the accented
        # lemma returned by snarf_noun_accents_and_forms().
        noun = getparam(t, "2")
        lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
        if not lemma:
          pagemsg("WARNING: Unable to find accented equivalent of %s: %s" % (noun, origt))
          continue
        t.add("2", lemma)
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append("replace lemma with accented %s in {{%s}}" % (lemma, tn))
      if saw_infl == "def_sg":
        # "def_sg" is resolved through the def_sub_sg and def_obj_sg entries,
        # which must agree.
        def_sub_sg = forms.get("def_sub_sg", None)
        def_obj_sg = forms.get("def_obj_sg", None)
        if def_sub_sg != def_obj_sg:
          pagemsg("WARNING: Inflection is def_sg but def_sub_sg %s != def_obj_sg %s" % (
            def_sub_sg, def_obj_sg))
          continue
        form = def_sub_sg
      else:
        form = forms.get(saw_infl, None)
      if not form:
        pagemsg("WARNING: Inflection is %s but couldn't find form among forms: %s" %
            (saw_infl, format_forms(forms)))
        continue
      # The entry is split on commas; keep only the forms that equal the page
      # title once accents are removed.
      form = form.split(",")
      filtered_form = [f for f in form if bglib.remove_accents(f) == pagetitle]
      if not filtered_form:
        pagemsg("WARNING: No forms among %s=%s match page title" % (saw_infl, ",".join(form)))
        continue
      form = filtered_form
      # Compare against the head= chain already present on the head template.
      existing_form = blib.fetch_param_chain(headt, "head", "head")
      if existing_form:
        must_continue = False
        for f in existing_form:
          if bglib.remove_accents(f) != pagetitle:
            pagemsg("WARNING: Existing head %s doesn't match page title: %s" % (
              f, unicode(headt)))
            must_continue = True
            break
        if must_continue:
          continue
        needs_accents = [bglib.needs_accents(f) for f in existing_form]
        if any(needs_accents) and not all(needs_accents):
          pagemsg("WARNING: Some but not all existing heads missing accents: %s" %
              unicode(headt))
          continue
        if not any(needs_accents):
          # No existing head needs accents: leave the head template alone,
          # warning only if it disagrees with the looked-up forms.
          if existing_form != form:
            pagemsg("WARNING: For inflection %s, existing form(s) %s != new form(s) %s" % (
              saw_infl, ",".join(existing_form), ",".join(form)))
          continue
      origheadt = unicode(headt)
      blib.set_param_chain(headt, form, "head", "head")
      pagemsg("Replaced %s with %s" % (origheadt, unicode(headt)))
      notes.append("add accented form %s=%s to {{head|bg|noun form}}" % (saw_infl, ",".join(form)))

  if saw_headt and not saw_inflt:
    pagemsg("WARNING: Saw head template %s but no inflection template" % unicode(headt))

  # Pass 3: convert the templates named in template_to_infl_codes to
  # {{inflection of}} by adding an empty 3= and the inflection codes from 4=.
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn in template_to_infl_codes and getparam(t, "1") == "bg":
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      infl_codes = template_to_infl_codes[tn]
      blib.set_template_name(t, "inflection of")
      t.add("3", "")
      for i, infl in enumerate(infl_codes):
        t.add(str(i + 4), infl)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{%s}} to {{inflection of}}" % tn)

  return unicode(parsed), notes
Esempio n. 50
0
def process_page(page, index, parsed):
    """Convert {{head|ru|noun form}} to {{ru-noun form}} and canonicalize
    the parameter layout of existing {{ru-noun form}} templates.

    The incoming `parsed` argument is ignored; the text of `page` is
    re-parsed here.

    Returns a tuple (new page text, list of change notes), or None when the
    page is skipped: a colon in the title, or a template with conflicting or
    unexpected parameters.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def add_canonical_params(t, head, head2, g, g2, g3, tr, tr2):
        # Re-add the extracted values in the canonical order
        # 1, head2, 2, g2, g3, tr, tr2. 1= is added (possibly blank)
        # whenever a gender is present; presumably so that 2= can follow it.
        if head or g:
            t.add("1", head)
        for name, value in [("head2", head2), ("2", g), ("g2", g2),
                            ("g3", g3), ("tr", tr), ("tr2", tr2)]:
            if value:
                t.add(name, value)

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping page")
        return

    text = unicode(page.text)
    notes = []

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = unicode(t.name)
        if (tn == "head" and getparam(t, "1") == "ru" and
                getparam(t, "2") == "noun form"):
            if getparam(t, "3"):
                pagemsg("WARNING: Found param 3 in {{head|ru|noun form}}: %s" %
                        unicode(t))
                return
            # The removal order below is kept exactly as before, since
            # removing positional params may depend on it.
            rmparam(t, "1")
            rmparam(t, "2")
            head = getrmparam(t, "head")
            head2 = getrmparam(t, "head2")
            tr = getrmparam(t, "tr")
            tr2 = getrmparam(t, "tr2")
            g = getrmparam(t, "g")
            g2 = getrmparam(t, "g2")
            g3 = getrmparam(t, "g3")
            # Anything still left is a param we don't know how to convert.
            if len(t.params) > 0:
                pagemsg("WARNING: Extra params in noun form template: %s" %
                        unicode(t))
                return
            t.name = "ru-noun form"
            add_canonical_params(t, head, head2, g, g2, g3, tr, tr2)
            newt = unicode(t)
            if origt != newt:
                pagemsg("Replaced %s with %s" % (origt, newt))
                notes.append(
                    "convert {{head|ru|noun form}} to {{ru-noun form}}")
        elif tn == "ru-noun form":
            if getparam(t, "head") and getparam(t, "1"):
                pagemsg(
                    "WARNING: ru-noun form has both params 1= and head=: %s" %
                    unicode(t))
                return
            if getparam(t, "g") and getparam(t, "2"):
                pagemsg("WARNING: ru-noun form has both params 2= and g=: %s" %
                        unicode(t))
                return
            # `or` short-circuits: head=/g= is only removed when 1=/2= was
            # empty or absent.
            head = getrmparam(t, "1") or getrmparam(t, "head")
            head2 = getrmparam(t, "head2")
            tr = getrmparam(t, "tr")
            tr2 = getrmparam(t, "tr2")
            g = getrmparam(t, "2") or getrmparam(t, "g")
            g2 = getrmparam(t, "g2")
            g3 = getrmparam(t, "g3")
            if len(t.params) > 0:
                pagemsg("WARNING: Extra params in noun form template: %s" %
                        unicode(t))
                return
            add_canonical_params(t, head, head2, g, g2, g3, tr, tr2)
            newt = unicode(t)
            if origt != newt:
                pagemsg("Replaced %s with %s" % (origt, newt))
                notes.append("canonicalize ru-noun form")

    return unicode(parsed), notes
def process_page(page, index, parsed):
  """Move lang= to 1= in {{prefixusex}}/{{suffixusex}} templates.

  When a template uses lang=, all of its params are rebuilt in a fixed
  order with the language code in 1= and the two terms shifted to 2=/3=;
  inline= is then dropped if set. Warnings are logged for terms that
  themselves look like affixes.

  The incoming `parsed` argument is ignored; the page is re-parsed.
  Returns (new page text, notes), or None if the title contains a colon.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  # Fetched as before, although the value is not used below.
  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []

  # Named params that are re-added explicitly and so must not be carried
  # over a second time with the leftover params.
  explicitly_handled = ["lang", "t1", "gloss1", "t2", "gloss2",
      "alt1", "alt2", "pos1", "pos2", "altpref", "altsuf"]

  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn == "prefixusex":
      if getparam(t, "1").endswith("-") or getparam(t, "2").endswith("-"):
        pagemsg("WARNING: Has prefix as term: %s" % origt)
    if tn == "suffixusex":
      if getparam(t, "1").startswith("-") or getparam(t, "2").startswith("-"):
        pagemsg("WARNING: Has suffix as term: %s" % origt)
    if tn in ["prefixusex", "suffixusex"]:
      lang = getparam(t, "lang")
      if lang:
        pagemsg("WARNING: Uses lang= param: %s" % origt)
        # Params to put back, in output order; 1= (the language) is added
        # separately because it is added unconditionally.
        ordered = [
          ("altpref", getparam(t, "altpref")),
          ("2", getparam(t, "1")),
          ("alt1", getparam(t, "alt1")),
          ("pos1", getparam(t, "pos1")),
          ("t1", getparam(t, "t1") or getparam(t, "gloss1")),
          ("altsuf", getparam(t, "altsuf")),
          ("3", getparam(t, "2")),
          ("alt2", getparam(t, "alt2")),
          ("pos2", getparam(t, "pos2")),
          ("t2", getparam(t, "t2") or getparam(t, "gloss2")),
        ]
        # Collect the remaining non-numbered params.
        leftovers = []
        for param in t.params:
          param_name = unicode(param.name)
          if re.search(r"^[0-9]+$", param_name):
            continue
          if param_name in explicitly_handled:
            continue
          leftovers.append((param_name, param.value))
        # Wipe the template and rebuild it.
        del t.params[:]
        t.add("1", lang)
        for name, value in ordered:
          if value:
            t.add(name, value)
        for name, value in leftovers:
          t.add(name, value)
        notes.append("Move lang= to 1= in prefixusex/suffixusex")
        # inline= is only examined for templates that had lang=.
        if getparam(t, "inline"):
          rmparam(t, "inline")
          notes.append("Remove inline= in prefixusex/suffixusex")
    newt = unicode(t)
    if newt != origt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  return unicode(parsed), notes
Esempio n. 52
0
def process_page_section(index, page, section, verbose):
  """Convert an old-style {{ru-noun}}/{{ru-proper noun}} headword in one
  section to {{ru-noun+}}/{{ru-proper noun+}}, based on the section's
  {{ru-noun-table}} declension template.

  index, page: used for log messages, existence check and page title.
  section: wikitext of the section to process.
  verbose: enables extra log output and is passed to blib.expand_text().

  Returns None when the section is skipped because of a problem or because
  it is already converted. Otherwise returns a 5-tuple
  (new section text, ru_noun_changed, ru_proper_noun_changed, bian_replaced,
  frobbed_manual_translit); the three counters are 0 or 1 and the last item
  is a list holding the headword translit moved into the decl template, if
  any. When there is no ru-noun-table or no headword template, the text is
  returned as parsed with zero counters.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None

  parsed = blib.parse_text(section)

  noun_table_templates = []
  noun_old_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)

  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if len(noun_table_templates) < 1:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
          ", ".join(unicode(x) for x in noun_old_templates))
    return unicode(parsed), 0, 0, 0, []

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      pagemsg("Found ru-noun+ or ru-proper noun+, skipping")
      return None

  headword_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      headword_templates.append(t)

  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if len(headword_templates) < 1:
    return unicode(parsed), 0, 0, 0, []

  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
  headword_template = headword_templates[0]
  frobbed_manual_translit = []
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  # Retrieve headword translit and maybe transfer to decl
  headword_tr = getparam(headword_template, "tr")
  if headword_tr:
    if verbose:
      pagemsg("Found headword manual translit tr=%s" % headword_tr)
    if "," in headword_tr:
      pagemsg("WARNING: Comma in headword manual translit, skipping: %s" %
          headword_tr)
      return None
    # Punt if multi-arg-set, can't handle yet
    for decl_template in decl_templates:
      for param in decl_template.params:
        # Only positional (unnamed) params are inspected here.
        if not param.showkey:
          val = unicode(param.value)
          if val == "or":
            pagemsg("WARNING: Manual translit and multi-decl templates, can't handle, skipping: %s" % unicode(decl_template))
            return None
          if val == "-" or val == "_" or val.startswith("join:"):
            pagemsg("WARNING: Manual translit and multi-word templates, can't handle, skipping: %s" % unicode(decl_template))
            return None
      for i in xrange(2, 10):
        if getparam(headword_template, "tr%s" % i):
          pagemsg("WARNING: Headword template has translit param tr%s, can't handle, skipping: %s" % (
            i, unicode(headword_template)))
          return None
      # The lemma is in 2= when 1= is a stress pattern, else in 1=.
      if runoun.arg1_is_stress(getparam(decl_template, "1")):
        lemma_arg = "2"
      else:
        lemma_arg = "1"
      lemmaval = getparam(decl_template, lemma_arg)
      if not lemmaval:
        lemmaval = subpagetitle
      if "//" in lemmaval:
        # The lemma already carries a manual translit after "//".
        m = re.search("^(.*?)//(.*)$", lemmaval)
        if m.group(2) != headword_tr:
          pagemsg("WARNING: Found existing manual translit in decl template %s, but doesn't match headword translit %s; skipping" % (
            lemmaval, headword_tr))
          return None
        else:
          pagemsg("Already found manual translit in decl template %s" %
              lemmaval)
      else:
        lemmaval += "//" + headword_tr
        orig_decl_template = unicode(decl_template)
        decl_template.add(lemma_arg, lemmaval)
        pagemsg("Replacing decl %s with %s" % (orig_decl_template,
          unicode(decl_template)))
        frobbed_manual_translit = [headword_tr]

  genders = blib.fetch_param_chain(headword_template, "2", "g")

  bian_replaced = 0

  # Change a=bi in decl to a=ia or a=ai, depending on order of anim/inan in
  # headword template
  for decl_template in decl_templates:
    if getparam(decl_template, "a") in ["b", "bi", "bian", "both"]:
      # Index of the first gender spec mentioning "in" / "an", or -1.
      saw_in = -1
      saw_an = -1
      for i,g in enumerate(genders):
        if re.search(r"\bin\b", g) and saw_in < 0:
          saw_in = i
        if re.search(r"\ban\b", g) and saw_an < 0:
          saw_an = i
      if saw_in >= 0 and saw_an >= 0:
        orig_decl_template = unicode(decl_template)
        if saw_in < saw_an:
          pagemsg("Replacing a=bi with a=ia in decl template")
          decl_template.add("a", "ia")
          bian_replaced = 1
        else:
          pagemsg("Replacing a=bi with a=ai in decl template")
          decl_template.add("a", "ai")
          bian_replaced = 1
        pagemsg("Replacing decl %s with %s" % (orig_decl_template,
          unicode(decl_template)))

  generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
      unicode(noun_table_template))
  generate_result = expand_text(generate_template)
  if not generate_result:
    pagemsg("WARNING: Error generating noun args, skipping")
    return None
  args = ru.split_generate_args(generate_result)

  genders = runoun.check_old_noun_headword_forms(headword_template, args,
      subpagetitle, pagemsg)
  if genders is None:
    return None

  new_params = [(param.name, param.value)
      for param in noun_table_template.params]

  orig_headword_template = unicode(headword_template)
  params_to_preserve = runoun.fix_old_headword_params(headword_template,
      new_params, genders, pagemsg)
  if params_to_preserve is None:
    return None

  if unicode(headword_template.name) == "ru-proper noun":
    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(headword_template, "n"):
      pagemsg("Adding n=both to headword tempate")
      headword_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      # Append |ndef=sg before the template's final "}}" only. A plain
      # str.replace() would also rewrite the "}}" of any template nested
      # inside a parameter value.
      generate_template_with_ndef = re.sub(r"\}\}$", "|ndef=sg}}",
          generate_template)
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
          generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = ru.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
              existing_n)
        else:
          pagemsg("Removing n=sg from headword tempate")
          rmparam(headword_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
            ndef_args["n"])

  headword_template.params.extend(params_to_preserve)
  ru_noun_changed = 0
  ru_proper_noun_changed = 0
  if unicode(headword_template.name) == "ru-noun":
    headword_template.name = "ru-noun+"
    ru_noun_changed = 1
  else:
    headword_template.name = "ru-proper noun+"
    ru_proper_noun_changed = 1

  pagemsg("Replacing headword %s with %s" % (orig_headword_template, unicode(headword_template)))

  return unicode(parsed), ru_noun_changed, ru_proper_noun_changed, bian_replaced, frobbed_manual_translit
Esempio n. 53
0
def rewrite_one_page_arz_headword(page, index, text):
  """Rewrite {{arz-noun}} and {{arz-adj}} templates to the new param layout.

  `text` is parsed wikicode (it must provide filter_templates()) and is
  modified in place. Returns (text, comment), where the comment lists one
  template name per rewritten template.
  """
  def take(t, name):
    # Read a param's value, then remove the param from the template.
    value = getparam(t, name)
    rmparam(t, name)
    return value

  def add_nonempty(t, pairs):
    # Add each (name, value) pair in order, skipping empty values.
    for name, value in pairs:
      if value:
        addparam(t, name, value)

  temps_changed = []
  for t in text.filter_templates():
    template_name = unicode(t.name)
    if template_name == "arz-noun":
      head = take(t, "head")
      tr = take(t, "tr")
      sort = take(t, "sort")
      g = take(t, "g")
      g2 = take(t, "g2")
      # Old positional 2=/3= hold the plural and its transliteration.
      pl = take(t, "2")
      pltr = take(t, "3")
      # Head and gender are always written, even when empty.
      addparam(t, "1", head)
      addparam(t, "2", g)
      add_nonempty(t, [("g2", g2), ("tr", tr), ("pl", pl), ("pltr", pltr),
        ("sort", sort)])
      temps_changed.append("arz-noun")
    elif template_name == "arz-adj":
      head = take(t, "head")
      tr = take(t, "tr")
      sort = take(t, "sort")
      # pwv=/fwv= take precedence over p=/f=; both variants are removed.
      pwv = take(t, "pwv")
      p = take(t, "p")
      pl = pwv or p
      pltr = take(t, "ptr")
      fwv = take(t, "fwv")
      plain_f = take(t, "f")
      f = fwv or plain_f
      ftr = take(t, "ftr")
      addparam(t, "1", head)
      add_nonempty(t, [("tr", tr), ("f", f), ("ftr", ftr), ("pl", pl),
        ("pltr", pltr), ("sort", sort)])
      temps_changed.append("arz-adj")
  comment = "rewrite %s to new style" % ", ".join(temps_changed)
  return text, comment
Esempio n. 54
0
def process_page(page, index, parsed):
    """Disabled converter from {{ru-conj-TYPE|...}} to {{ru-conj|TYPE|...}}.

    The function logs a warning and returns None immediately. Everything
    after that first `return` is unreachable and is kept only as a record
    of the former conversion logic.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("WARNING: Script no longer applies and would need fixing up")
    return

    # ---- Unreachable from here on ----
    pagemsg("Processing")

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []

    # Maps a conjugation type to the number of the positional argument
    # holding its variant spec; types not listed have no variant argument.
    variant_argno_by_type = {}
    for argno, conjtypes in [
            (3, ["3oa", "4a", "4b", "4c", "6a", "6c", "11a", "16a", "16b",
                 u"irreg-дать", u"irreg-клясть", u"irreg-быть"]),
            (4, ["5a", "5b", "5c", "6b", "9a", "9b", "11b", "14a", "14b",
                 "14c"]),
            (5, ["7b"]),
            (6, ["7a"]),
    ]:
        for ct in conjtypes:
            variant_argno_by_type[ct] = argno

    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = unicode(t.name)
        if tn.startswith("ru-conj-") and tn != "ru-conj-verb-see":
            m = re.search("^ru-conj-(.*)$", tn)
            t.name = "ru-conj"
            conjtype = m.group(1)
            variant = None
            varargno = variant_argno_by_type.get(conjtype)
            if varargno:
                variant = getparam(t, str(varargno))
                if re.search("^[abc]", variant):
                    variant = "/" + variant
                # Blank the variant slot if later args exist, else remove it.
                later_args = any(
                    getparam(t, str(varargno + offset))
                    for offset in (1, 2, 3))
                if later_args:
                    t.add(str(varargno), "")
                else:
                    rmparam(t, str(varargno))
                conjtype = conjtype + variant
            moved = " (and move variant spec)" if variant else ""
            notes.append(
                "ru-conj-* -> ru-conj, moving params up by one%s" % moved)
            # Shift numbered params up by one, working down from the top so
            # nothing is overwritten; blanks above the highest set param
            # are not copied.
            seenval = False
            for i in xrange(20, 0, -1):
                val = getparam(t, str(i))
                if val:
                    seenval = True
                if seenval:
                    t.add(str(i + 1), val)
            t.add("1", conjtype)
            blib.sort_params(t)
        newt = unicode(t)
        if newt != origt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes
Esempio n. 55
0
def process_page(index, page, save, verbose):
  """Replace {{ru-decl-noun-see}} with a {{ru-noun-table}} whose params are
  copied from the page's {{ru-noun+}} or {{ru-proper noun+}} headword.

  The page must contain exactly one headword template and exactly one
  ru-decl-noun-see template; otherwise a warning is logged and nothing is
  changed. For proper nouns, n= is adjusted so that the resulting table has
  the same number as the headword. The page is saved only when `save` is
  true; otherwise the would-be edit comment is logged.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  parsed = blib.parse(page)

  # Locate the single headword template and the single "see" template.
  headword_template = None
  see_template = None
  for t in parsed.filter_templates():
    tn = unicode(t.name)
    if tn in ["ru-noun+", "ru-proper noun+"]:
      if headword_template:
        pagemsg("WARNING: Multiple headword templates, skipping")
        return
      headword_template = t
    elif tn in ["ru-decl-noun-see"]:
      if see_template:
        pagemsg("WARNING: Multiple ru-decl-noun-see templates, skipping")
        return
      see_template = t
  if not headword_template:
    pagemsg("WARNING: No ru-noun+ or ru-proper noun+ templates, skipping")
    return
  if not see_template:
    pagemsg("WARNING: No ru-decl-noun-see templates, skipping")
    return

  # Replace the see-template's params with a copy of the headword's params.
  del see_template.params[:]
  for param in headword_template.params:
    see_template.add(param.name, param.value)
  see_template.name = "ru-noun-table"

  if unicode(headword_template.name) == "ru-proper noun+":
    # Proper nouns default to n=sg whereas ru-noun-table defaults to n=both,
    # so expand the templates, fetch the actual value of n and set it
    # explicitly in ru-noun-table where the defaults would disagree.

    # Generate the headword's args with |ndef=sg, because ru-proper noun+
    # defaults to sg and ru-generate-noun-args would otherwise default to
    # both.
    headword_generate_template = re.sub(r"\}\}$", "|ndef=sg}}",
        re.sub(r"^\{\{ru-proper noun\+", "{{ru-generate-noun-args",
          unicode(headword_template)))
    headword_generate_result = expand_text(headword_generate_template)
    if not headword_generate_result:
      pagemsg("WARNING: Error generating ru-proper noun+ args")
      return None
    headword_n = ru.split_generate_args(headword_generate_result)["n"]
    if headword_n == "s":
      # Singular always has to be stated explicitly in ru-noun-table.
      see_template.add("n", "sg")
    elif headword_n != "p":
      # (For "p" nothing is done: both templates default to plural only if
      # the lemma is plural, else n=pl has to be set for both.)
      # Here n must be "both", which had to be set explicitly in the
      # headword but is the default in ru-noun-table unless the lemma is
      # plural. Drop n=, regenerate, and restore n=both only if the default
      # turns out to be something else.
      assert headword_n == "b"
      rmparam(see_template, "n")
      see_generate_template = re.sub(r"^\{\{ru-noun-table",
          "{{ru-generate-noun-args", unicode(see_template))
      see_generate_result = expand_text(see_generate_template)
      if not see_generate_result:
        pagemsg("WARNING: Error generating ru-noun-table args")
        return None
      if ru.split_generate_args(see_generate_result)["n"] != "b":
        see_template.add("n", "both")

  comment = "Replace ru-decl-noun-see with ru-noun-table, taken from headword template (%s)" % unicode(headword_template.name)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
  else:
    pagemsg("Saving with comment = %s" % comment)
    page.text = unicode(parsed)
    page.save(comment=comment)
Esempio n. 56
0
def process_page(index, page, save, verbose):
  """Canonicalize verb-form definitions in the Russian section of a page.

  The ==Russian== section is rewritten in several consecutive passes, each
  working on the output of the previous one:
    1. rename {{conjugation of|lang=ru}} to {{inflection of}};
    2. split a 'present or future' definition line into 'pres' and 'fut';
    3. replace long-form form codes with abbreviations;
    4. drop blank form codes and reorder lang=, 1=, 2=, tr= (nocat= is
       not put back);
    5. put imperative/past/present/future form codes in canonical order;
    6. convert raw-text participle, past, imperative and present/future
       definitions into {{inflection of}} templates.

  Pages with a colon in the title or with more than one Russian section
  are skipped. If the text changed, the page is saved when `save` is true;
  otherwise the would-be edit comment is logged. With `verbose`, the full
  old and new text are logged as well. Returns None.
  """
  pagetitle = unicode(page.title())
  # NOTE(review): subpagetitle is not referenced again in this function.
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []

  foundrussian = False
  # Because of the capturing group, the split keeps the level-2 headers:
  # odd indices hold header lines, even indices hold the text after them.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  for j in xrange(2, len(sections), 2):
    if sections[j-1] == "==Russian==\n":
      if foundrussian:
        pagemsg("WARNING: Found multiple Russian sections, skipping page")
        return
      foundrussian = True

      # Try to canonicalize existing 'conjugation of'
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "conjugation of" and getparam(t, "lang") == "ru":
          origt = unicode(t)
          t.name = "inflection of"
          newt = unicode(t)
          if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))
            notes.append("converted 'conjugation of' to 'inflection of'")
      sections[j] = unicode(parsed)

      # Try to split 'inflection of' containing 'present or future' into two
      # defns
      newsec = re.sub(r"^# \{\{inflection of\|(.*?)\|present or future\|(.*?)\}\}$",
          r"# {{inflection of|\1|pres|\2}}\n# {{inflection of|\1|fut|\2}}",
          sections[j], 0, re.M)
      if newsec != sections[j]:
        notes.append("split 'present or future' form code into two defns with 'pres' and 'fut'")
        sections[j] = newsec

      # Convert 'indc' to 'ind', 'futr' to 'fut', 'perfective' and
      # '(perfective)' to 'pfv', 'imperfective' and '(imperfective)' to 'impfv',
      # 'impr' to 'imp'
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
          # Each (frm, to) pair is applied and logged separately, so a note
          # is recorded per kind of replacement actually made.
          for frm, to in [
              ("indc", "ind"), ("indicative", "ind"),
              ("futr", "fut"), ("future", "fut"),
              ("impr", "imp"), ("imperative", "imp"),
              ("perfective", "pfv"), ("(perfective)", "pfv"),
              ("imperfective", "impfv"), ("(imperfective)", "impfv"),
              ("singular", "s"), ("(singular)", "s"),
              ("plural", "p"), ("(plural)", "p"),
              ("masculine", "m"), ("(masculine)", "m"),
              ("feminine", "f"), ("(feminine)", "f"),
              ("neuter", "n"), ("(neuter)", "n"), ("neutral", "n"), ("(neutral)", "n"),
              ]:
            origt = unicode(t)
            for i in xrange(3,20):
              val = getparam(t, str(i))
              if val == frm:
                t.add(str(i), to)
            newt = unicode(t)
            if origt != newt:
              pagemsg("Replaced %s with %s" % (origt, newt))
              notes.append("converted '%s' form code to '%s'" % (frm, to))
      sections[j] = unicode(parsed)

      # Remove blank form codes and canonicalize position of lang=, tr=
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
          origt = unicode(t)
          # Fetch the numbered params starting with 3, skipping blank ones
          numbered_params = []
          for i in xrange(3,20):
            val = getparam(t, str(i))
            if val:
              numbered_params.append(val)
          # Fetch param 1 and param 2, and non-numbered params except lang=
          # and nocat=.
          param1 = getparam(t, "1")
          param2 = getparam(t, "2")
          tr = getparam(t, "tr")
          nocat = getparam(t, "nocat")
          non_numbered_params = []
          for param in t.params:
            pname = unicode(param.name)
            if not re.search(r"^[0-9]+$", pname) and pname not in ["lang", "nocat", "tr"]:
              non_numbered_params.append((pname, param.value))
          # Erase all params.
          del t.params[:]
          # Put back lang, param 1, param 2, tr, then the replacements for the
          # higher numbered params, then the non-numbered params.
          t.add("lang", "ru")
          t.add("1", param1)
          t.add("2", param2)
          if tr:
            t.add("tr", tr)
          for i, param in enumerate(numbered_params):
            t.add(str(i+3), param)
          for name, value in non_numbered_params:
            t.add(name, value)
          newt = unicode(t)
          if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))
            notes.append("removed any blank form codes and maybe rearranged lang=, tr=")
            # nocat= was read above but never added back to the template.
            if nocat:
              notes.append("removed nocat=")
      sections[j] = unicode(parsed)

      # Try to canonicalize 'inflection of' involving the imperative,
      # present, future
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
          # Fetch the numbered params starting with 3
          numbered_params = []
          for i in xrange(3,20):
            val = getparam(t, str(i))
            if val:
              numbered_params.append(val)
          # Only non-empty values were appended above, so this loop never
          # deletes anything.
          while len(numbered_params) > 0 and not numbered_params[-1]:
            del numbered_params[-1]
          # Now canonicalize
          numparamstr = "/".join(numbered_params)
          numparamset = set(numbered_params)
          canon_params = []
          # The body of this loop always ends in `break`, so it runs exactly
          # once.
          while True:
            if numparamset == {'s', 'pfv', 'imp'}:
              canon_params = ['2', 's', 'pfv', 'imp']
            elif numparamset == {'s', 'impfv', 'imp'}:
              canon_params = ['2', 's', 'impfv', 'imp']
            elif numparamset == {'s', 'imp'}:
              canon_params = ['2', 's', 'imp']
            elif numparamset == {'p', 'pfv', 'imp'}:
              canon_params = ['2', 'p', 'pfv', 'imp']
            elif numparamset == {'p', 'impfv', 'imp'}:
              canon_params = ['2', 'p', 'impfv', 'imp']
            elif numparamset == {'p', 'imp'}:
              canon_params = ['2', 'p', 'imp']
            elif numparamset == {'m', 's', 'past'}:
              canon_params = ['m', 's', 'past', 'ind']
            elif numparamset == {'f', 's', 'past'}:
              canon_params = ['f', 's', 'past', 'ind']
            elif numparamset == {'n', 's', 'past'}:
              canon_params = ['n', 's', 'past', 'ind']
            elif numparamset == {'p', 'past'}:
              canon_params = ['p', 'past', 'ind']
            else:
              # Present/future must already be in person/number/tense order;
              # only the trailing 'ind' is added.
              m = re.search(r"^([123])/([sp])/(pres|fut)$", numparamstr)
              if m:
                canon_params = [m.group(1), m.group(2), m.group(3), "ind"]
            break
          if canon_params:
            origt = unicode(t)
            # Fetch param 1 and param 2. Erase all numbered params.
            # Put back param 1 and param 2 (this will put them after lang=ru),
            # then the replacements for the higher params.
            param1 = getparam(t, "1")
            param2 = getparam(t, "2")
            for i in xrange(19,0,-1):
              rmparam(t, str(i))
            t.add("1", param1)
            t.add("2", param2)
            for i, param in enumerate(canon_params):
              t.add(str(i+3), param)
            newt = unicode(t)
            if origt != newt:
              pagemsg("Replaced %s with %s" % (origt, newt))
              notes.append("canonicalized 'inflection of' for %s" % "/".join(canon_params))
            else:
              pagemsg("Apparently already canonicalized: %s" % newt)
      sections[j] = unicode(parsed)

      # Try to add 'inflection of' to raw-specified participial inflection
      # (callback for re.sub: group 1 is the "# " or "(" prefix, groups 2/3
      # are tense and voice in the order named by the callback, group 4 is
      # the lemma)
      def add_participle_inflection_of(m):
        prefix = m.group(1)
        tense = m.group(2).lower()
        if tense == "present":
          tense = "pres"
        voice = m.group(3).lower()
        if voice == "active":
          voice = "act"
        elif voice == "passive":
          voice = "pass"
        elif voice == "adverbial":
          voice = "adv"
        lemma = m.group(4)
        retval = prefix + "{{inflection of|lang=ru|%s||%s|%s|part}}" % (lemma, tense, voice)
        pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
        notes.append("converted raw to 'inflection of' for %s/%s/part" % (tense, voice))
        return retval
      # Two word orders are handled: "<tense> participle <voice> of" and
      # "<tense> <voice> participle of".
      newsec = re.sub(r"(# |\()'*(present|past) participle (active|passive|adverbial) of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_participle_inflection_of,
          sections[j], 0, re.I)
      newsec = re.sub(r"(# |\()'*(present|past) (active|passive|adverbial) participle of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_participle_inflection_of,
          newsec, 0, re.I)
      sections[j] = newsec

      # Try to add 'inflection of' to raw-specified past inflection
      def add_past_inflection_of(m):
        prefix = m.group(1)
        gender = {"masculine":"m", "male":"m", "feminine":"f", "female":"f",
            "neuter":"n", "neutral":"n", "plural":"p"}[m.group(2).lower()]
        lemma = m.group(3)
        # The 's' (singular) code is added for every gender except plural.
        retval = prefix + "{{inflection of|lang=ru|%s||%s%s|past|ind}}" % (lemma, gender, gender != "p" and "|s" or "")
        pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
        notes.append("converted raw to 'inflection of' for %s%s/past/ind" % (gender, gender != "p" and "/s" or ""))
        return retval
      newsec = re.sub(r"(# |\()'*(male|masculine|female|feminine|neutral|neuter|plural) (?:singular |)past (?:tense |form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_past_inflection_of,
          sections[j], 0, re.I)
      newsec = re.sub(r"(# |\()'*past(?:-tense| tense|) (male|masculine|female|feminine|neutral|neuter|plural) (?:singular |)(?:form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_past_inflection_of,
          newsec, 0, re.I)
      sections[j] = newsec

      # Try to add 'inflection of' to raw-specified imperative inflection
      def add_imper_inflection_of(m):
        prefix = m.group(1)
        number = {"singular":"s", "plural":"p"}[m.group(2).lower()]
        lemma = m.group(3)
        retval = prefix + "{{inflection of|lang=ru|%s||2|%s|imp}}" % (lemma, number)
        pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
        notes.append("converted raw to 'inflection of' for 2/%s/imp" % number)
        return retval
      newsec = re.sub(r"(# |\()'*(singular|plural) imperative (?:form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_imper_inflection_of,
          sections[j], 0, re.I)
      newsec = re.sub(r"(# |\()'*imperative (singular|plural) (?:form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_imper_inflection_of,
          newsec, 0, re.I)
      sections[j] = newsec

      # Try to add 'inflection of' to raw-specified finite pres/fut inflection
      def add_pres_fut_inflection_of(m):
        prefix = m.group(1)
        # First character of "1st"/"2nd"/"3rd" is the person digit.
        person = m.group(2)[0]
        number = {"singular":"s", "plural":"p"}[m.group(3).lower()]
        tense = {"present":"pres", "future":"fut"}[m.group(4).lower()]
        lemma = m.group(5)
        retval = prefix + "{{inflection of|lang=ru|%s||%s|%s|%s|ind}}" % (lemma, person, number, tense)
        pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
        notes.append("converted raw to 'inflection of' for %s/%s/%s/ind" % (person, number, tense))
        return retval
      newsec = re.sub(r"(# |\()'*(1st|2nd|3rd)(?:-person| person|) (singular|plural) (present|future) (?:tense |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_pres_fut_inflection_of,
          sections[j], 0, re.I)
      sections[j] = newsec

  new_text = "".join(sections)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    # Every pass that can change the text also records a note.
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
Esempio n. 57
0
def process_page(index, page, save, verbose, romaji_to_keep):
  """Clean up Japanese headword templates on a single page.

  For every {{ja-noun}}, {{ja-adj}}, {{ja-verb}} and {{ja-pos}} template:
    1. drop an old script code in 1= and shift the remaining numbered
       params down by one;
    2. turn hira= / kata= into numbered params, keeping numbered params
       ahead of named ones;
    3. drop rom= unless the page title is in `romaji_to_keep`;
    4. drop hidx=.

  Args:
    index: Page counter, used only in log messages.
    page: Page object; its title and text are read, and its text is
      replaced and saved when `save` is true.
    save: If true, save the modified page; otherwise only log the comment
      that would have been used.
    verbose: If true, log the full old and new page text when they differ.
    romaji_to_keep: Container of page titles whose rom= must be preserved.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def is_numbered(param_name):
    # True for purely numeric (positional) parameter names.
    return re.search(r"^[0-9]+$", param_name)

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname not in ("ja-noun", "ja-adj", "ja-verb", "ja-pos"):
      continue
    before = unicode(t)

    # Step 1: remove an old script code and renumber what follows it.
    script_code = getparam(t, "1")
    if script_code in ("r", "h", "ka", "k", "s", "ky", "kk"):
      pagemsg("Removing 1=%s: %s" % (script_code, unicode(t)))
      notes.append("remove 1=%s from %s" % (script_code, tname))
      rmparam(t, "1")
      for param in t.params:
        pname = unicode(param.name)
        if is_numbered(pname):
          param.name = str(int(pname) - 1)
          param.showkey = False

    # Step 2: fold hira= / kata= into the numbered params. The template is
    # rebuilt from scratch so that numbered params always precede the
    # named ones.
    if t.has("hira") or t.has("kata"):
      positional = []
      named = []
      for param in t.params:
        pname = unicode(param.name)
        if is_numbered(pname):
          # Blank numbered params are dropped.
          val = unicode(param.value)
          if val:
            positional.append(val)
        elif pname not in ("hira", "kata"):
          named.append((pname, param.value))
      hira = getparam(t, "hira")
      if hira:
        positional.append(hira)
        slot = len(positional)
        pagemsg("Moving hira=%s to %s=: %s" % (hira, slot, unicode(t)))
        notes.append("move hira= to %s= in %s" % (slot, tname))
      kata = getparam(t, "kata")
      if kata:
        positional.append(kata)
        slot = len(positional)
        pagemsg("Moving kata=%s to %s=: %s" % (kata, slot, unicode(t)))
        notes.append("move kata= to %s= in %s" % (slot, tname))
      del t.params[:]
      for position, val in enumerate(positional, 1):
        t.add(str(position), val)
      for pname, pvalue in named:
        t.add(pname, pvalue)

    # Step 3: rom= survives only on explicitly listed pages.
    if t.has("rom"):
      rom = getparam(t, "rom")
      if pagetitle not in romaji_to_keep:
        pagemsg("Removing rom=%s: %s" % (rom, unicode(t)))
        rmparam(t, "rom")
        notes.append("remove rom= from %s" % tname)
      else:
        pagemsg("Keeping rom=%s because in romaji_to_keep: %s" % (
          rom, unicode(t)))

    # Step 4: hidx= is always removed.
    if t.has("hidx"):
      pagemsg("Removing hidx=%s: %s" % (getparam(t, "hidx"), unicode(t)))
      rmparam(t, "hidx")
      notes.append("remove hidx= from %s" % tname)

    after = unicode(t)
    if before != after:
      pagemsg("Replaced %s with %s" % (before, after))

  new_text = unicode(parsed)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  # Every textual change is expected to have recorded at least one note.
  assert notes
  comment = "; ".join(blib.group_notes(notes))
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
Esempio n. 58
0
def process_page(index, page, save, verbose):
  """Remove redundant gender/plural parameters from {{fr-adj}} on one page.

  A parameter is removed when its value equals the form derived from the
  page title by the suffix rules coded below (default plural, default
  feminine, feminine + "s"). Templates with inv=, with second
  feminines/plurals, or with inconsistent parameters are left alone after
  logging a message; edits already made to such a template are kept.

  Args:
    index: Page counter, used only in log messages.
    page: Page object; its title and text are read, and its text is
      replaced and saved when `save` is true.
    save: If true, save the modified page; otherwise only log the comment
      that would have been used.
    verbose: Not used by this function.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  text = unicode(page.text)

  notes = []

  # The expected forms depend only on the page title, so compute them once
  # instead of once per template. The default plural is shared by the mp=
  # check and the p= check.
  default_plural = (pagetitle if re.search("[sx]$", pagetitle)
      else re.sub("al$", "aux", pagetitle) if pagetitle.endswith("al")
      else pagetitle + "s")
  # Order matters: -ieur must be tested before -teur and -eur.
  expected_fem = (pagetitle if pagetitle.endswith("e")
      else pagetitle + "ne" if pagetitle.endswith("en")
      else re.sub("er$", u"ère", pagetitle) if pagetitle.endswith("er")
      else pagetitle + "le" if pagetitle.endswith("el")
      else pagetitle + "ne" if pagetitle.endswith("on")
      else pagetitle + "te" if pagetitle.endswith("et")
      else pagetitle + "e" if pagetitle.endswith("ieur")
      else re.sub("teur$", "trice", pagetitle) if pagetitle.endswith("teur")
      else re.sub("eur$", "euse", pagetitle) if pagetitle.endswith("eur")
      else re.sub("eux$", "euse", pagetitle) if pagetitle.endswith("eux")
      else re.sub("if$", "ive", pagetitle) if pagetitle.endswith("if")
      else re.sub("c$", "que", pagetitle) if pagetitle.endswith("c")
      else pagetitle + "e")

  def simplify_fr_adj(t):
    # Edit one {{fr-adj}} template in place. Returns early when the
    # template cannot be analyzed further; edits made up to that point
    # remain, and the caller logs them.
    g = getparam(t, "1")
    if g and g != "mf":
      pagemsg("WARNING: Strange value 1=%s, removing: %s" % (g, unicode(t)))
      rmparam(t, "1")
      notes.append("remove bogus 1=%s" % g)
      g = None
    inv = getparam(t, "inv")
    if inv:
      if inv not in ["y", "yes", "1"]:
        pagemsg("WARNING: Strange value inv=%s: %s" % (inv, unicode(t)))
      if (getparam(t, "1") or getparam(t, "f") or
          getparam(t, "mp") or getparam(t, "fp") or getparam(t, "p")):
        pagemsg("WARNING: Found extraneous params with inv=: %s" %
            unicode(t))
      return
    if (getparam(t, "f2") or getparam(t, "mp2") or getparam(t, "fp2")
        or getparam(t, "p2")):
      pagemsg("Skipping multiple feminines or plurals: %s" % unicode(t))
      return
    if getparam(t, "mp") == default_plural:
      rmparam(t, "mp")
      notes.append("remove redundant mp=")
    if re.search("(el|on|et|[^i]eur|eux|if|c)$", pagetitle) and not getparam(t, "f") and g != "mf":
      pagemsg("WARNING: Found suffix -el/-on/-et/-[^i]eur/-eux/-if/-c and no f= or 1=mf: %s" % unicode(t))
    if getparam(t, "f") == expected_fem:
      rmparam(t, "f")
      notes.append("remove redundant f=")
    fem = getparam(t, "f") or expected_fem
    if not fem.endswith("e"):
      if not getparam(t, "fp"):
        pagemsg("WARNING: Found f=%s not ending with -e and no fp=: %s" %
            (fem, unicode(t)))
      return
    expected_fp = fem + "s"
    if getparam(t, "fp") == expected_fp:
      rmparam(t, "fp")
      notes.append("remove redundant fp=")
    if getparam(t, "fp") and not getparam(t, "f"):
      pagemsg("WARNING: Found fp=%s and no f=: %s" % (getparam(t, "fp"),
        unicode(t)))
      return
    if getparam(t, "fp") == fem:
      pagemsg("WARNING: Found fp=%s same as fem=%s: %s" % (getparam(t, "fp"),
        fem, unicode(t)))
      return
    if pagetitle.endswith("e") and not getparam(t, "f") and not getparam(t, "fp"):
      # A title in -e with no explicit feminine forms is handled as 1=mf,
      # so an explicit 1=mf is redundant.
      if g == "mf":
        rmparam(t, "1")
        notes.append("remove redundant 1=mf")
      g = "mf"
    if g == "mf":
      f = getparam(t, "f")
      if f:
        pagemsg("WARNING: Found f=%s and 1=mf: %s" % (f, unicode(t)))
      mp = getparam(t, "mp")
      if mp:
        pagemsg("WARNING: Found mp=%s and 1=mf: %s" % (mp, unicode(t)))
      fp = getparam(t, "fp")
      if fp:
        pagemsg("WARNING: Found fp=%s and 1=mf: %s" % (fp, unicode(t)))
      if f or mp or fp:
        return
      if getparam(t, "p") == default_plural:
        rmparam(t, "p")
        notes.append("remove redundant p=")
    elif getparam(t, "p"):
      pagemsg("WARNING: Found unexpected p=%s: %s" % (getparam(t, "p"),
        unicode(t)))
    if not re.search("[ -]", pagetitle) and (getparam(t, "f") or
        getparam(t, "mp") or getparam(t, "fp") or getparam(t, "p")):
      pagemsg("Found remaining explicit feminine or plural in single-word base form: %s"
          % unicode(t))

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) == "fr-adj":
      simplify_fr_adj(t)
    # Runs even when the helper bailed out early, so partial edits
    # (e.g. a bogus 1= removed from an inv= template) are always logged.
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replacing %s with %s" % (origt, newt))

  newtext = unicode(parsed)
  if newtext != text:
    # Every textual change is expected to have recorded at least one note.
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = newtext
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
Esempio n. 59
0
def process_page(index, page, save, verbose):
  """Replace manual past passive participles in {{ru-conj}} with a variant.

  For each {{ru-conj}} that lists past passive participles (PPPs) by hand,
  candidate variant codes ("+p", and where the first manual form suggests
  it, u"+pё" / u"+pжд") are appended to param 2 of a copy of the template
  with the manual PPP params stripped. The copy is expanded as
  {{ru-generate-verb-forms}}. The first variant whose generated PPPs equal
  the manual ones is applied to the real template.

  Args:
    index: Page counter, used only in log messages.
    page: Page object; its title and text are read, and its text is
      replaced and saved when `save` is true.
    save: If true, save the modified page; otherwise only log the comment
      that would have been used.
    verbose: Passed to blib.expand_text; if true, also log the full old
      and new page text when they differ.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  pagemsg("Processing")

  manual_ppp_forms = ["past_pasv_part", "past_pasv_part2", "past_pasv_part3",
    "past_pasv_part4", "ppp", "ppp2", "ppp3", "ppp4"]
  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  for t in parsed.filter_templates():
    origt = unicode(t)
    tname = unicode(t.name)
    if tname == "ru-conj":
      # Collect the hand-specified PPPs, ignoring empty and "-" values.
      given = [getparam(t, form) for form in manual_ppp_forms]
      manual_ppps = [ppp for ppp in given if ppp and ppp != "-"]
      if not manual_ppps:
        continue
      if any(unicode(param.value) == "or" for param in t.params):
        pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
        continue
      curvariant = getparam(t, "2")
      if any(marker in curvariant for marker in ("+p", "(7)", "(8)")):
        pagemsg("WARNING: Found both manual PPP and PPP variant, something wrong: %s" %
            unicode(t))
        continue

      # Work on a re-parsed copy so the real template is touched only
      # once a matching variant has been found.
      probe = blib.parse_text(unicode(t)).filter_templates()[0]
      for form in manual_ppp_forms:
        rmparam(probe, form)

      first_ppp = manual_ppps[0]
      variants_to_try = ["+p"]
      if u"ё" in re.sub(u"ённый$", "", first_ppp):
        variants_to_try.append(u"+pё")
      if u"жденный" in first_ppp or u"ждённый" in first_ppp:
        variants_to_try.append(u"+pжд")

      notsamemsgs = []
      for variant in variants_to_try:
        probe.add("2", curvariant + variant)
        tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms",
            unicode(probe))
        result = expand_text(tempcall)
        if not result:
          pagemsg("WARNING: Error generating forms, skipping")
          continue
        args = rulib.split_generate_args(result)
        if "past_pasv_part" not in args:
          pagemsg("WARNING: Something wrong, no past passive participle generated: %s" % unicode(t))
          continue
        auto_ppps = [ppp
            for form in manual_ppp_forms if form in args
            for ppp in re.split(",", args[form]) if ppp and ppp != "-"]
        if manual_ppps == auto_ppps:
          pagemsg("Manual PPP's %s same as auto-generated PPP's, switching to auto"
              % ",".join(manual_ppps))
          for form in manual_ppp_forms:
            rmparam(t, form)
          t.add("2", curvariant + variant)
          notes.append("replaced manual PPP's with variant %s" % variant)
          break
        notsamemsgs.append("WARNING: Manual PPP's %s not same as auto-generated PPP's %s: %s" %
          (",".join(manual_ppps), ",".join(auto_ppps), unicode(t)))
      else:
        # No variant matched: only now report the collected mismatches.
        for mismatch in notsamemsgs:
          pagemsg(mismatch)

    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  # Every textual change is expected to have recorded at least one note.
  assert notes
  comment = "; ".join(notes)
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
Esempio n. 60
0
def process_page(index, page, fix_missing_plurals):
    """Convert French {{head|fr|...}} templates to dedicated fr-* templates.

    Handles nouns ({{fr-noun}}), proper nouns ({{fr-proper noun}}),
    invariable adjectives ({{fr-adj|inv=y}}) and a fixed set of simple
    headword types (adjective/verb forms, interjections, prepositions,
    prefixes, suffixes). A template carrying parameters outside the
    recognized set for its type is left untouched with a warning.

    Args:
        index: Page counter, used only in log messages.
        page: Page object; its title and text are read.
        fix_missing_plurals: If true, a noun with no plural given is still
            converted and the change note is flagged "(NEEDS REVIEW)";
            otherwise such a noun is skipped.

    Returns:
        A (new page text, list of change notes) tuple, or None when the
        page title contains a colon.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def has_unknown_param(tmpl, allowed):
        # True if any parameter name of tmpl is not listed in `allowed`.
        return any(unicode(param.name) not in allowed
                   for param in tmpl.params)

    def drop_params(tmpl, names):
        # Remove the named parameters, in the order given.
        for pname in names:
            rmparam(tmpl, pname)

    def warn_unrecognized(tmpl):
        pagemsg("WARNING: Unrecognized parameters in %s, skipping" %
                unicode(tmpl))

    def warn_no_gender(tmpl):
        pagemsg("WARNING: No gender given in %s, skipping" % unicode(tmpl))

    # Headword types whose conversion is only a rename plus head=.
    simple_templates = {
        "adjective form": "fr-adj-form",
        "verb form": "fr-verb-form",
        "verb forms": "fr-verb-form",
        "interjection": "fr-intj",
        "preposition": "fr-prep",
        "prefix": "fr-prefix",
        "prefixes": "fr-prefix",
        "suffix": "fr-suffix",
        "suffixes": "fr-suffix",
    }

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping")
        return

    text = unicode(page.text)

    notes = []
    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        name = unicode(t.name)
        if name != "head" or getparam(t, "1") != "fr":
            continue
        headtype = getparam(t, "2")
        fixed_plural_warning = False

        if headtype == "noun":
            head = getparam(t, "head")
            g = getparam(t, "g")
            g2 = getparam(t, "g2")
            plural = getparam(t, "4") if getparam(t, "3") == "plural" else ""
            allowed = ["1", "2", "head", "g", "g2", "sort"]
            if plural:
                # 3=/4= are understood only as the "plural"/<form> pair.
                allowed += ["3", "4"]
            if has_unknown_param(t, allowed):
                warn_unrecognized(t)
                continue
            if not g:
                warn_no_gender(t)
                continue
            found_feminine_noun = (
                g == "f" and not g2 and not plural and any(
                    unicode(tt.name) == "feminine noun of"
                    and getparam(tt, "lang") == "fr"
                    for tt in parsed.filter_templates()))
            if found_feminine_noun:
                pagemsg("Found 'feminine noun of', assuming countable")
            elif g not in ["m-p", "f-p"] and not plural:
                if not fix_missing_plurals:
                    pagemsg("WARNING: No plural given in %s, skipping" %
                            unicode(t))
                    continue
                pagemsg(
                    "WARNING: No plural given in %s, assuming default plural, PLEASE REVIEW"
                    % unicode(t))
                fixed_plural_warning = True
            drop_params(t, ["4", "3", "2", "1", "head", "g", "g2", "sort"])
            t.name = "fr-noun"
            if head:
                t.add("head", head)
            t.add("1", g)
            if g2:
                t.add("g2", g2)
            if plural:
                t.add("2", plural)

        elif headtype in ["proper noun", "proper nouns"]:
            head = getparam(t, "head")
            g = getparam(t, "g")
            g2 = getparam(t, "g2")
            # With no g=, a gender code in 3= is taken as the gender.
            remove_3 = (not g
                        and getparam(t, "3") in ["m", "f", "m-p", "f-p"])
            if remove_3:
                g = getparam(t, "3")
            allowed = ["1", "2", "head", "g", "g2", "sort"]
            if remove_3:
                allowed.append("3")
            if has_unknown_param(t, allowed):
                warn_unrecognized(t)
                continue
            if not g:
                warn_no_gender(t)
                continue
            drop_params(t, ["3", "2", "1", "head", "g", "g2", "sort"])
            t.name = "fr-proper noun"
            if head:
                t.add("head", head)
            t.add("1", g)
            if g2:
                t.add("g2", g2)

        elif headtype in ["adjective", "adjectives"]:
            if getparam(t, "3") not in ["invariable", "invariant"]:
                warn_unrecognized(t)
            else:
                # Whatever remains after discarding the understood
                # parameters is unrecognized.
                leftover = {}
                for p in t.params:
                    leftover[unicode(p.name)] = unicode(p.value)
                for key in ("1", "2", "3"):
                    del leftover[key]
                if getparam(t, "g") == "m" and getparam(t, "g2") == "f":
                    del leftover["g"]
                    del leftover["g2"]
                if leftover:
                    warn_unrecognized(t)
                else:
                    drop_params(t, ["g2", "g", "3", "2", "1"])
                    t.name = "fr-adj"
                    t.add("inv", "y")

        elif headtype in simple_templates:
            head = getparam(t, "head")
            allowed = ["1", "2", "head", "sort"]
            if headtype in ["adjective form", "suffix", "suffixes"]:
                # Only these types may carry g=; it is left in place.
                allowed.append("g")
            if has_unknown_param(t, allowed):
                warn_unrecognized(t)
                continue
            drop_params(t, ["sort", "head", "2", "1"])
            t.name = simple_templates[headtype]
            if head:
                t.add("head", head)

        newt = unicode(t)
        if origt != newt:
            pagemsg("Replacing %s with %s" % (origt, newt))
            review_tag = " (NEEDS REVIEW)" if fixed_plural_warning else ""
            notes.append(
                "replaced {{head|fr|%s}} with {{%s}}%s" %
                (headtype, unicode(t.name), review_tag))

    return unicode(parsed), notes