def process_page(index, page, save, verbose):
  """Remove gender parameters from Russian adjective-form headwords.

  Only the ==Russian== section of PAGE is examined. For every
  {{head|ru|adjective form}} template found there, the g=, g2=, g3= and g4=
  parameters are removed. Pages whose title contains a colon, or that have
  more than one Russian section, are skipped with a warning.

  index: value shown in log messages next to the page title.
  page: page object; title() and text are read, and text/save() are used when
    saving.
  save: if true, write the changed text back; otherwise only log what would
    be saved.
  verbose: if true, log the complete old and new page text when they differ.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []

  # Splitting on a capturing group keeps the level-2 headers at the odd
  # indices and the section bodies at the even indices.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
  seen_russian = False

  for body_index in xrange(2, len(sections), 2):
    if sections[body_index - 1] != "==Russian==\n":
      continue
    if seen_russian:
      pagemsg("WARNING: Found multiple Russian sections, skipping page")
      return
    seen_russian = True

    # Remove gender from adjective forms
    parsed = blib.parse_text(sections[body_index])
    for t in parsed.filter_templates():
      is_adjective_form_head = (
          unicode(t.name) == "head" and
          getparam(t, "1") == "ru" and
          getparam(t, "2") == "adjective form")
      if not is_adjective_form_head:
        continue
      before = unicode(t)
      for gender_param in ("g", "g2", "g3", "g4"):
        rmparam(t, gender_param)
      after = unicode(t)
      if before != after:
        pagemsg("Replaced %s with %s" % (before, after))
        notes.append("remove gender from adjective forms")
    sections[body_index] = unicode(parsed)

  new_text = "".join(sections)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  # Any textual change must have been recorded as a note above.
  assert notes
  comment = "; ".join(blib.group_notes(notes))
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
Example #2
0
def process_page(index, page, save, verbose, romaji_to_keep):
  """Clean up parameters of Japanese headword templates on one page.

  For each ja-noun, ja-adj, ja-verb and ja-pos template:
    * an old script code in 1= is removed and the remaining numbered params
      are shifted down by one;
    * hira= and kata= are converted into numbered params, which are placed
      before all named params;
    * rom= is removed unless the page title is in ROMAJI_TO_KEEP;
    * hidx= is removed.

  index: value shown in log messages next to the page title.
  page: page object; title() and text are read, and text/save() are used when
    saving.
  save: if true, write the changed text back; otherwise only log what would
    be saved.
  verbose: if true, log the complete old and new page text when they differ.
  romaji_to_keep: container of page titles for which rom= is preserved.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []

  headword_templates = ["ja-noun", "ja-adj", "ja-verb", "ja-pos"]
  old_script_codes = ["r", "h", "ka", "k", "s", "ky", "kk"]

  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname not in headword_templates:
      continue
    template_before = unicode(t)

    # Remove old script code
    script_code = getparam(t, "1")
    if script_code in old_script_codes:
      pagemsg("Removing 1=%s: %s" % (script_code, unicode(t)))
      notes.append("remove 1=%s from %s" % (script_code, tname))
      rmparam(t, "1")
      # Shift every remaining numbered param down by one and render it
      # positionally (without an explicit key).
      for param in t.params:
        key = unicode(param.name)
        if re.search(r"^[0-9]+$", key):
          param.name = str(int(key) - 1)
          param.showkey = False

    # Convert hira= and/or kata= to numbered param. The template is rebuilt
    # from scratch so that numbered params always precede named ones.
    if t.has("hira") or t.has("kata"):
      positional_values = []
      named_params = []
      for param in t.params:
        key = unicode(param.name)
        if re.search(r"^[0-9]+$", key):
          # Blank numbered params are dropped.
          value = unicode(param.value)
          if value:
            positional_values.append(value)
          continue
        if key in ["hira", "kata"]:
          continue
        named_params.append((key, param.value))

      hira = getparam(t, "hira")
      if hira:
        positional_values.append(hira)
        hira_position = len(positional_values)
        pagemsg("Moving hira=%s to %s=: %s" % (hira, hira_position,
          unicode(t)))
        notes.append("move hira= to %s= in %s" % (hira_position,
          tname))

      kata = getparam(t, "kata")
      if kata:
        positional_values.append(kata)
        kata_position = len(positional_values)
        pagemsg("Moving kata=%s to %s=: %s" % (kata, kata_position,
          unicode(t)))
        notes.append("move kata= to %s= in %s" % (kata_position,
          tname))

      del t.params[:]
      # Put back numbered params, then non-numbered params.
      for position, value in enumerate(positional_values):
        t.add(str(position + 1), value)
      for key, value in named_params:
        t.add(key, value)

    # Remove rom= if not in list of pages to keep rom=
    if t.has("rom"):
      romaji = getparam(t, "rom")
      if pagetitle in romaji_to_keep:
        pagemsg("Keeping rom=%s because in romaji_to_keep: %s" % (
          romaji, unicode(t)))
      else:
        pagemsg("Removing rom=%s: %s" % (romaji, unicode(t)))
        rmparam(t, "rom")
        notes.append("remove rom= from %s" % tname)

    # Remove hidx=
    if t.has("hidx"):
      pagemsg("Removing hidx=%s: %s" % (getparam(t, "hidx"), unicode(t)))
      rmparam(t, "hidx")
      notes.append("remove hidx= from %s" % tname)

    template_after = unicode(t)
    if template_before != template_after:
      pagemsg("Replaced %s with %s" % (template_before, template_after))

  new_text = unicode(parsed)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  # Any textual change must have been recorded as a note above.
  assert notes
  comment = "; ".join(blib.group_notes(notes))
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
def process_page(index, page, save, verbose):
  """Canonicalize Russian {{inflection of}} templates on one page.

  Works only inside the ==Russian== section and makes three passes over the
  {{inflection of|lang=ru|...}} templates found there:
    1. rebuild the template as lang=, 1=, tr= (if any), 2=, the non-blank
       form codes from 3= onward, then the other named params (nocat= is
       dropped);
    2. abbreviate long form codes (e.g. 'genitive' -> 'gen', 'prep' -> 'pre');
    3. turn number|case (e.g. s|gen) into case|number (gen|s) when exactly
       those two form codes are present.
  Pages whose title contains a colon, or that have more than one Russian
  section, are skipped with a warning.

  index: value shown in log messages next to the page title.
  page: page object; title() and text are read, and text/save() are used when
    saving.
  save: if true, write the changed text back; otherwise only log what would
    be saved.
  verbose: if true, log the complete old and new page text when they differ.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def is_russian_inflection_of(t):
    return unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru"

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []

  # (long form code, abbreviated form code), applied in this order.
  abbreviations = [
      ("nominative", "nom"), ("accusative", "acc"),
      ("genitive", "gen"), ("dative", "dat"),
      ("instrumental", "ins"),
      ("prep", "pre"), ("prepositional", "pre"),
      ("vocative", "voc"), ("locative", "loc"), ("partitive", "par"),
      ("singular", "s"), ("(singular)", "s"),
      ("plural", "p"), ("(plural)", "p"),
      ("inanimate", "in"), ("animate", "an"),
      ]
  number_codes = ["s", "p"]
  case_codes = ["nom", "gen", "dat", "acc", "ins", "pre", "voc", "loc", "par"]

  # Splitting on a capturing group keeps the level-2 headers at the odd
  # indices and the section bodies at the even indices.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
  seen_russian = False

  for j in xrange(2, len(sections), 2):
    if sections[j-1] != "==Russian==\n":
      continue
    if seen_russian:
      pagemsg("WARNING: Found multiple Russian sections, skipping page")
      return
    seen_russian = True

    # Pass 1: remove blank form codes and canonicalize position of lang=, tr=
    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      if not is_russian_inflection_of(t):
        continue
      before = unicode(t)
      # Form codes live in params 3..19; blank ones are discarded.
      raw_codes = [getparam(t, str(num)) for num in xrange(3, 20)]
      form_codes = [code for code in raw_codes if code]
      lemma = getparam(t, "1")
      alt_text = getparam(t, "2")
      translit = getparam(t, "tr")
      nocat = getparam(t, "nocat")
      # Named params other than lang=, nocat= and tr= are carried over as is.
      other_named = []
      for param in t.params:
        key = unicode(param.name)
        if re.search(r"^[0-9]+$", key) or key in ["lang", "nocat", "tr"]:
          continue
        other_named.append((key, param.value))
      # Erase everything and rebuild in canonical order.
      del t.params[:]
      t.add("lang", "ru")
      t.add("1", lemma)
      if translit:
        t.add("tr", translit)
      t.add("2", alt_text)
      for offset, code in enumerate(form_codes):
        t.add(str(offset + 3), code)
      for key, value in other_named:
        t.add(key, value)
      after = unicode(t)
      if before != after:
        pagemsg("Replaced %s with %s" % (before, after))
        notes.append("removed any blank form codes and maybe rearranged lang=, tr=")
        if nocat:
          notes.append("removed nocat=")
    sections[j] = unicode(parsed)

    # Pass 2: convert 'prep' to 'pre', etc.
    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      if not is_russian_inflection_of(t):
        continue
      for frm, to in abbreviations:
        before = unicode(t)
        for num in xrange(3, 20):
          slot = str(num)
          if getparam(t, slot) == frm:
            t.add(slot, to)
        after = unicode(t)
        if before != after:
          pagemsg("Replaced %s with %s" % (before, after))
          notes.append("converted '%s' form code to '%s'" % (frm, to))
    sections[j] = unicode(parsed)

    # Pass 3: rearrange order of s|gen, p|nom etc. to gen|s, nom|p etc.
    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      if not is_russian_inflection_of(t):
        continue
      number = getparam(t, "3")
      case = getparam(t, "4")
      # Only swap when there are exactly two form codes: number then case.
      if (number not in number_codes or case not in case_codes or
          getparam(t, "5")):
        continue
      before = unicode(t)
      t.add("3", case)
      t.add("4", number)
      after = unicode(t)
      if before != after:
        pagemsg("Replaced %s with %s" % (before, after))
        notes.append("converted '%s|%s' to '%s|%s'" %
            (number, case, case, number))
    sections[j] = unicode(parsed)

  new_text = "".join(sections)
  if new_text == text:
    return

  if verbose:
    pagemsg("Replacing <%s> with <%s>" % (text, new_text))
  # Any textual change must have been recorded as a note above.
  assert notes
  comment = "; ".join(blib.group_notes(notes))
  if not save:
    pagemsg("Would save with comment = %s" % comment)
    return
  pagemsg("Saving with comment = %s" % comment)
  page.text = new_text
  page.save(comment=comment)
def process_page(index, page, fix, save, verbose):
  """Check that ru-conj aspects agree with the preceding ru-verb headword.

  Walks the templates of PAGE in order. Each {{ru-verb}} sets the current
  headword aspect(s) from its 2= param ('pf', 'impf', or 'both' meaning both);
  each {{ru-conj}} / {{ru-conj-old}} has the aspect at the start of its 1=
  param (the part before the first '-') compared against them, and
  mismatches are logged as warnings.

  If FIX is true and the page has exactly one ru-verb headword with exactly
  one aspect, the aspect prefix of every ru-conj / ru-conj-old 1= param is
  overwritten with the headword's aspect.

  index: value shown in log messages next to the page title.
  page: page object; title() and text are read, and text/save() are used when
    saving.
  fix: if true, attempt the override described above.
  save: if true, write the changed text back; otherwise only log what would
    be saved.
  verbose: if true, log the complete old and new page text when they differ.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  text = unicode(page.text)
  notes = []

  parsed = blib.parse(page)
  headword_aspects = set()
  # Count the headwords directly. Inferring "multiple headwords" from the
  # previous headword's aspect set being non-empty misses the case where an
  # earlier ru-verb had '?' or an unrecognized aspect, and would then let the
  # fix below override conjugations belonging to a different headword.
  num_headwords = 0
  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname == "ru-verb":
      num_headwords += 1
      headword_aspects = set()
      aspect = getparam(t, "2")
      if aspect in ["pf", "impf"]:
        headword_aspects.add(aspect)
      elif aspect == "both":
        headword_aspects.add("pf")
        headword_aspects.add("impf")
      elif aspect == "?":
        pagemsg("WARNING: Found aspect '?'")
      else:
        pagemsg("WARNING: Found bad aspect value '%s' in ru-verb" % aspect)
    elif tname in ["ru-conj", "ru-conj-old"]:
      # The aspect is whatever precedes the first hyphen in 1=.
      aspect = re.sub("-.*", "", getparam(t, "1"))
      if aspect not in ["pf", "impf"]:
        pagemsg("WARNING: Found bad aspect value '%s' in ru-conj" %
            getparam(t, "1"))
      elif not headword_aspects:
        # Also reached when the preceding ru-verb had '?' or a bad aspect,
        # since that leaves the aspect set empty.
        pagemsg("WARNING: No ru-verb preceding ru-conj: %s" % unicode(t))
      elif aspect not in headword_aspects:
        pagemsg("WARNING: ru-conj aspect %s not in ru-verb aspect %s" %
            (aspect, ",".join(headword_aspects)))
  found_multiple_headwords = num_headwords > 1

  if fix:
    if found_multiple_headwords:
      pagemsg("WARNING: Multiple ru-verb headwords, not fixing")
    elif not headword_aspects:
      pagemsg("WARNING: No ru-verb headwords, not fixing")
    elif len(headword_aspects) > 1:
      pagemsg("WARNING: Multiple aspects in ru-verb, not fixing")
    else:
      # Exactly one aspect is present at this point.
      headword_aspect = next(iter(headword_aspects))
      for t in parsed.filter_templates():
        if unicode(t.name) not in ["ru-conj", "ru-conj-old"]:
          continue
        origt = unicode(t)
        # Replace only a leading 'pf'/'impf', keeping any '-...' suffix.
        param1 = re.sub("^(pf|impf)((-.*)?)$", r"%s\2" % headword_aspect,
            getparam(t, "1"))
        t.add("1", param1)
        newt = unicode(t)
        if origt != newt:
          pagemsg("Replaced %s with %s" % (origt, newt))
          notes.append("overrode conjugation aspect with %s" % headword_aspect)

  new_text = unicode(parsed)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    # Any textual change must have been recorded as a note above.
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
Example #5
0
def process_page(index, page, save, verbose):
  """Canonicalize Russian verb-form definitions on one page.

  Works only inside the ==Russian== section. In order, it:
    1. renames {{conjugation of|lang=ru}} to {{inflection of}};
    2. splits a 'present or future' definition line into separate 'pres' and
       'fut' definition lines;
    3. abbreviates long form codes ('indicative' -> 'ind', etc.);
    4. rebuilds each template as lang=, 1=, 2=, tr= (if any), the non-blank
       form codes from 3= onward, then other named params (nocat= is dropped);
    5. puts imperative, past and present/future form codes into a canonical
       order and set;
    6. converts raw-text participle, past, imperative and present/future
       definitions into {{inflection of}} templates.
  The steps run sequentially on the section text, so later steps see the
  output of earlier ones. Pages whose title contains a colon, or that have
  more than one Russian section, are skipped with a warning.

  index: value shown in log messages next to the page title.
  page: page object; title() and text are read, and text/save() are used when
    saving.
  save: if true, write the changed text back; otherwise only log what would
    be saved.
  verbose: if true, log the complete old and new page text when they differ.
  """
  pagetitle = unicode(page.title())
  # NOTE: subpagetitle is computed but not used anywhere in this function.
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []

  foundrussian = False
  # Splitting on a capturing group keeps the level-2 headers at the odd
  # indices and the section bodies at the even indices.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  for j in xrange(2, len(sections), 2):
    if sections[j-1] == "==Russian==\n":
      if foundrussian:
        pagemsg("WARNING: Found multiple Russian sections, skipping page")
        return
      foundrussian = True

      # Try to canonicalize existing 'conjugation of'
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "conjugation of" and getparam(t, "lang") == "ru":
          origt = unicode(t)
          t.name = "inflection of"
          newt = unicode(t)
          if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))
            notes.append("converted 'conjugation of' to 'inflection of'")
      sections[j] = unicode(parsed)

      # Try to split 'inflection of' containing 'present or future' into two
      # defns
      # The pattern is anchored to a whole definition line (re.M), so only
      # lines consisting solely of the template are split.
      newsec = re.sub(r"^# \{\{inflection of\|(.*?)\|present or future\|(.*?)\}\}$",
          r"# {{inflection of|\1|pres|\2}}\n# {{inflection of|\1|fut|\2}}",
          sections[j], 0, re.M)
      if newsec != sections[j]:
        notes.append("split 'present or future' form code into two defns with 'pres' and 'fut'")
        sections[j] = newsec

      # Convert 'indc' to 'ind', 'futr' to 'fut', 'perfective' and
      # '(perfective)' to 'pfv', 'imperfective' and '(imperfective)' to 'impfv',
      # 'impr' to 'imp'
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
          # One note is recorded per (frm, to) pair that actually changed the
          # template.
          for frm, to in [
              ("indc", "ind"), ("indicative", "ind"),
              ("futr", "fut"), ("future", "fut"),
              ("impr", "imp"), ("imperative", "imp"),
              ("perfective", "pfv"), ("(perfective)", "pfv"),
              ("imperfective", "impfv"), ("(imperfective)", "impfv"),
              ("singular", "s"), ("(singular)", "s"),
              ("plural", "p"), ("(plural)", "p"),
              ("masculine", "m"), ("(masculine)", "m"),
              ("feminine", "f"), ("(feminine)", "f"),
              ("neuter", "n"), ("(neuter)", "n"), ("neutral", "n"), ("(neutral)", "n"),
              ]:
            origt = unicode(t)
            for i in xrange(3,20):
              val = getparam(t, str(i))
              if val == frm:
                t.add(str(i), to)
            newt = unicode(t)
            if origt != newt:
              pagemsg("Replaced %s with %s" % (origt, newt))
              notes.append("converted '%s' form code to '%s'" % (frm, to))
      sections[j] = unicode(parsed)

      # Remove blank form codes and canonicalize position of lang=, tr=
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
          origt = unicode(t)
          # Fetch the numbered params starting with 3, skipping blank ones
          numbered_params = []
          for i in xrange(3,20):
            val = getparam(t, str(i))
            if val:
              numbered_params.append(val)
          # Fetch param 1 and param 2, and non-numbered params except lang=
          # and nocat=.
          param1 = getparam(t, "1")
          param2 = getparam(t, "2")
          tr = getparam(t, "tr")
          # nocat= is read only so its removal can be noted; it is never
          # added back below.
          nocat = getparam(t, "nocat")
          non_numbered_params = []
          for param in t.params:
            pname = unicode(param.name)
            if not re.search(r"^[0-9]+$", pname) and pname not in ["lang", "nocat", "tr"]:
              non_numbered_params.append((pname, param.value))
          # Erase all params.
          del t.params[:]
          # Put back lang, param 1, param 2, tr, then the replacements for the
          # higher numbered params, then the non-numbered params.
          t.add("lang", "ru")
          t.add("1", param1)
          t.add("2", param2)
          if tr:
            t.add("tr", tr)
          for i, param in enumerate(numbered_params):
            t.add(str(i+3), param)
          for name, value in non_numbered_params:
            t.add(name, value)
          newt = unicode(t)
          if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))
            notes.append("removed any blank form codes and maybe rearranged lang=, tr=")
            if nocat:
              notes.append("removed nocat=")
      sections[j] = unicode(parsed)

      # Try to canonicalize 'inflection of' involving the imperative,
      # present, future
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
          # Fetch the numbered params starting with 3
          numbered_params = []
          for i in xrange(3,20):
            val = getparam(t, str(i))
            if val:
              numbered_params.append(val)
          # NOTE: only non-blank values were appended above, so this trimming
          # loop never removes anything.
          while len(numbered_params) > 0 and not numbered_params[-1]:
            del numbered_params[-1]
          # Now canonicalize
          numparamstr = "/".join(numbered_params)
          # Imperative and past forms are matched as an unordered set of
          # codes; present/future forms are matched on the exact order
          # person/number/tense.
          numparamset = set(numbered_params)
          canon_params = []
          # This loop body runs exactly once: it ends in an unconditional
          # break.
          while True:
            if numparamset == {'s', 'pfv', 'imp'}:
              canon_params = ['2', 's', 'pfv', 'imp']
            elif numparamset == {'s', 'impfv', 'imp'}:
              canon_params = ['2', 's', 'impfv', 'imp']
            elif numparamset == {'s', 'imp'}:
              canon_params = ['2', 's', 'imp']
            elif numparamset == {'p', 'pfv', 'imp'}:
              canon_params = ['2', 'p', 'pfv', 'imp']
            elif numparamset == {'p', 'impfv', 'imp'}:
              canon_params = ['2', 'p', 'impfv', 'imp']
            elif numparamset == {'p', 'imp'}:
              canon_params = ['2', 'p', 'imp']
            elif numparamset == {'m', 's', 'past'}:
              canon_params = ['m', 's', 'past', 'ind']
            elif numparamset == {'f', 's', 'past'}:
              canon_params = ['f', 's', 'past', 'ind']
            elif numparamset == {'n', 's', 'past'}:
              canon_params = ['n', 's', 'past', 'ind']
            elif numparamset == {'p', 'past'}:
              canon_params = ['p', 'past', 'ind']
            else:
              m = re.search(r"^([123])/([sp])/(pres|fut)$", numparamstr)
              if m:
                canon_params = [m.group(1), m.group(2), m.group(3), "ind"]
            break
          if canon_params:
            origt = unicode(t)
            # Fetch param 1 and param 2. Erase all numbered params.
            # Put back param 1 and param 2 (this will put them after lang=ru),
            # then the replacements for the higher params.
            param1 = getparam(t, "1")
            param2 = getparam(t, "2")
            for i in xrange(19,0,-1):
              rmparam(t, str(i))
            t.add("1", param1)
            t.add("2", param2)
            for i, param in enumerate(canon_params):
              t.add(str(i+3), param)
            newt = unicode(t)
            if origt != newt:
              pagemsg("Replaced %s with %s" % (origt, newt))
              notes.append("canonicalized 'inflection of' for %s" % "/".join(canon_params))
            else:
              pagemsg("Apparently already canonicalized: %s" % newt)
      sections[j] = unicode(parsed)

      # The four conversions below work on raw section text. Each regex
      # accepts a definition introduced by "# " or "(" whose lemma is given as
      # a [[link]], {{l|ru|...}}, {{m|ru|...}} or {{term|...|lang=ru}}, with
      # optional surrounding apostrophes, and matches case-insensitively
      # (re.I). The callbacks log and record a note for every replacement.

      # Try to add 'inflection of' to raw-specified participial inflection
      def add_participle_inflection_of(m):
        prefix = m.group(1)
        tense = m.group(2).lower()
        if tense == "present":
          tense = "pres"
        voice = m.group(3).lower()
        if voice == "active":
          voice = "act"
        elif voice == "passive":
          voice = "pass"
        elif voice == "adverbial":
          voice = "adv"
        lemma = m.group(4)
        retval = prefix + "{{inflection of|lang=ru|%s||%s|%s|part}}" % (lemma, tense, voice)
        pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
        notes.append("converted raw to 'inflection of' for %s/%s/part" % (tense, voice))
        return retval
      # Two word orders are handled: "<tense> participle <voice> of" and
      # "<tense> <voice> participle of".
      newsec = re.sub(r"(# |\()'*(present|past) participle (active|passive|adverbial) of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_participle_inflection_of,
          sections[j], 0, re.I)
      newsec = re.sub(r"(# |\()'*(present|past) (active|passive|adverbial) participle of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_participle_inflection_of,
          newsec, 0, re.I)
      sections[j] = newsec

      # Try to add 'inflection of' to raw-specified past inflection
      def add_past_inflection_of(m):
        prefix = m.group(1)
        gender = {"masculine":"m", "male":"m", "feminine":"f", "female":"f",
            "neuter":"n", "neutral":"n", "plural":"p"}[m.group(2).lower()]
        lemma = m.group(3)
        # The and/or expression appends the singular code 's' for every
        # gender except plural.
        retval = prefix + "{{inflection of|lang=ru|%s||%s%s|past|ind}}" % (lemma, gender, gender != "p" and "|s" or "")
        pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
        notes.append("converted raw to 'inflection of' for %s%s/past/ind" % (gender, gender != "p" and "/s" or ""))
        return retval
      # Two word orders are handled: "<gender> [singular] past ... of" and
      # "past ... <gender> [singular] ... of".
      newsec = re.sub(r"(# |\()'*(male|masculine|female|feminine|neutral|neuter|plural) (?:singular |)past (?:tense |form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_past_inflection_of,
          sections[j], 0, re.I)
      newsec = re.sub(r"(# |\()'*past(?:-tense| tense|) (male|masculine|female|feminine|neutral|neuter|plural) (?:singular |)(?:form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_past_inflection_of,
          newsec, 0, re.I)
      sections[j] = newsec

      # Try to add 'inflection of' to raw-specified imperative inflection
      def add_imper_inflection_of(m):
        prefix = m.group(1)
        number = {"singular":"s", "plural":"p"}[m.group(2).lower()]
        lemma = m.group(3)
        # The person is always emitted as 2 for imperatives.
        retval = prefix + "{{inflection of|lang=ru|%s||2|%s|imp}}" % (lemma, number)
        pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
        notes.append("converted raw to 'inflection of' for 2/%s/imp" % number)
        return retval
      # Two word orders are handled: "<number> imperative ... of" and
      # "imperative <number> ... of".
      newsec = re.sub(r"(# |\()'*(singular|plural) imperative (?:form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_imper_inflection_of,
          sections[j], 0, re.I)
      newsec = re.sub(r"(# |\()'*imperative (singular|plural) (?:form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_imper_inflection_of,
          newsec, 0, re.I)
      sections[j] = newsec

      # Try to add 'inflection of' to raw-specified finite pres/fut inflection
      def add_pres_fut_inflection_of(m):
        prefix = m.group(1)
        # First character of '1st'/'2nd'/'3rd' is the person digit.
        person = m.group(2)[0]
        number = {"singular":"s", "plural":"p"}[m.group(3).lower()]
        tense = {"present":"pres", "future":"fut"}[m.group(4).lower()]
        lemma = m.group(5)
        retval = prefix + "{{inflection of|lang=ru|%s||%s|%s|%s|ind}}" % (lemma, person, number, tense)
        pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
        notes.append("converted raw to 'inflection of' for %s/%s/%s/ind" % (person, number, tense))
        return retval
      newsec = re.sub(r"(# |\()'*(1st|2nd|3rd)(?:-person| person|) (singular|plural) (present|future) (?:tense |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*", add_pres_fut_inflection_of,
          sections[j], 0, re.I)
      sections[j] = newsec

  new_text = "".join(sections)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    # Any textual change must have been recorded as a note above.
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
Example #6
0
def process_line(index, line, add_passive_of, override_etym, save, verbose):
  """Add (or replace) the Etymology section of one Russian entry.

  LINE has two whitespace-separated fields: the accented term and an
  etymology spec. Underscores in either field become spaces, and a
  backslash-u sequence becomes a literal underscore. If the spec starts with
  'raw:', the rest of the line (spaces included) belongs to the spec. Lines
  starting with '#' are ignored; a leading '!' forces OVERRIDE_ETYM on for
  that line.

  Recognized etymology specs:
    ?                    rejected as an error
    -                    request-for-etymology template
    --                   no etymology text
    part/adj/partadj + f/n/p + ':LEMMA'
                         no etymology text (inflection text is built but not
                         used by this function)
    acr:EXPR:MEANING     acronym
    deverb:TERM          deverbal
    back:TERM            back-formation
    raw:TEXT             text used as is (comma spacing normalized)
    LANG:TERM            borrowing ('?' prefix = perhaps, '<<' = ultimately)
    A+B+...              affix, optionally prefixed by LANG: and '?' or '<<'

  index: value shown in log messages next to the page title.
  line: the input line described above.
  add_passive_of: if true, also append a {{passive of}} definition line after
    the first run of definition lines in the Russian section.
  override_etym: if true, replace the body of an existing Etymology (or
    Etymology 1) subsection instead of skipping the page.
  save: if true, save the page; otherwise only log what would be saved.
  verbose: if true, log the complete old and new page text when they differ.

  Raises AssertionError (via error()) on malformed input.
  """
  def error(text):
    errmsg("ERROR: Processing line: %s" % line)
    errmsg("ERROR: %s" % text)
    # Raise explicitly: `assert False` is stripped under -O, which would let
    # processing continue past a fatal input error.
    raise AssertionError(text)

  # NOTE(review): check_stress is not called anywhere in this function.
  def check_stress(word):
    # Drop everything from the first "|" onward. The pipe has to be escaped;
    # unescaped it is regex alternation with an empty first branch, so the
    # suffix was never stripped.
    word = re.sub(r"\|.*", "", word)
    if word.startswith("-") or word.endswith("-"):
      # Allow unstressed prefix (e.g. разо-) and unstressed suffix (e.g. -овать)
      return
    if rulib.needs_accents(word, split_dash=True):
      error("Word %s missing an accent" % word)

  # Skip lines consisting entirely of comments
  if line.startswith("#"):
    return
  if line.startswith("!"):
    override_etym = True
    line = line[1:]
  # If the second element (the etymology) begins with raw:, allow spaces in the remainder to be
  # included as part of the second element.
  els = do_split(r"\s+", line, 1)
  if len(els) != 2:
    error("Expected two fields, saw %s" % len(els))
  if not els[1].startswith("raw:"):
    els = do_split(r"\s+", line)
  # Replace _ with space, and backslash-u with a literal underscore
  els = [el.replace("_", " ").replace(r"\u", "_") for el in els]
  if len(els) != 2:
    error("Expected two fields, saw %s" % len(els))
  accented_term = els[0]
  term = rulib.remove_accents(accented_term)
  etym = els[1]

  pagetitle = term

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  # Handle etymology. etymtext is the full subsection (header + body) and
  # etymbody is the body alone, used when overriding an existing section.
  # etymbody stays None when there is no body to substitute, so the override
  # path below can be skipped instead of hitting an unbound name.
  etymbody = None
  adjformtext = ""
  if etym == "?":
    error("Etymology consists of bare question mark")
  elif etym == "-":
    etymbody = "{{rfe|lang=ru}}\n\n"
    etymtext = "===Etymology===\n" + etymbody
  elif etym == "--":
    etymtext = ""
  elif re.search(r"^(part|adj|partadj)([fnp]):", etym):
    m = re.search(r"^(part|adj|partadj)([fnp]):(.*)", etym)
    forms = {"f":["nom|f|s"], "n":["nom|n|s", "acc|n|s"], "p":["nom|p", "in|acc|p"]}
    infleclines = ["# {{inflection of|lang=ru|%s||%s}}" %
        (m.group(3), form) for form in forms[m.group(2)]]
    # NOTE(review): headterm and trtext are not defined in this function;
    # unless they exist at module level this branch raises NameError. The
    # resulting adjformtext is also never used below. Left as is pending
    # confirmation of the intended behavior.
    if m.group(1) in ["adj", "partadj"]:
      adjinfltext = """===Adjective===
{{head|ru|adjective form|head=%s%s}}

%s\n\n""" % (headterm, trtext, "\n".join(infleclines))
    else:
      adjinfltext = ""
    if m.group(1) in ["part", "partadj"]:
      partinfltext = """===Participle===
{{head|ru|participle form|head=%s%s}}

%s\n\n""" % (headterm, trtext, "\n".join(infleclines))
    else:
      partinfltext = ""
    adjformtext = partinfltext + adjinfltext
    etymtext = ""
  else:
    if etym.startswith("acr:"):
      _, fullexpr, meaning = do_split(":", etym)
      etymtext = "{{ru-etym acronym of|%s||%s}}." % (fullexpr, meaning)
    elif etym.startswith("deverb:"):
      _, sourceterm = do_split(":", etym)
      etymtext = "Deverbal from {{m|ru|%s}}." % sourceterm
    elif etym.startswith("back:"):
      _, sourceterm = do_split(":", etym)
      etymtext = "{{back-form|lang=ru|%s}}" % sourceterm
    elif etym.startswith("raw:"):
      etymtext = re.sub(", *", ", ", re.sub("^raw:", "", etym))
    elif ":" in etym and "+" not in etym:
      # Borrowing: LANG:TERM, optionally prefixed by ? or <<
      if etym.startswith("?"):
        prefix = "Perhaps borrowed from "
        etym = re.sub(r"^\?", "", etym)
      elif etym.startswith("<<"):
        prefix = "Ultimately borrowed from "
        etym = re.sub(r"^<<", "", etym)
      else:
        prefix = "Borrowed from "
      m = re.search(r"^([a-zA-Z.-]+):(.*)", etym)
      if not m:
        error("Bad etymology form: %s" % etym)
      etymtext = "%s{{bor|ru|%s|%s}}." % (prefix, m.group(1), m.group(2))
    else:
      # Affix: parts separated by +, optionally prefixed by LANG: and ? or <<
      prefix = ""
      suffix = ""
      if etym.startswith("?"):
        prefix = "Perhaps from "
        suffix = "."
        etym = re.sub(r"^\?", "", etym)
      elif etym.startswith("<<"):
        prefix = "Ultimately from "
        suffix = "."
        etym = re.sub(r"^<<", "", etym)
      m = re.search(r"^([a-zA-Z.-]+):(.*)", etym)
      if m:
        langtext = "|lang1=%s" % m.group(1)
        etym = m.group(2)
      else:
        langtext = ""
      etymtext = "%s{{affix|ru|%s%s}}%s" % (prefix,
          "|".join(do_split(r"\+", re.sub(", *", ", ", etym))), langtext,
          suffix)
    etymbody = etymtext + "\n\n"
    etymtext = "===Etymology===\n" + etymbody

  # NOTE(review): despite the message, processing continues (an empty
  # etymtext is inserted, and add_passive_of can still change the page).
  # Confirm whether an early return is intended.
  if not etymtext:
    pagemsg("No etymology text, skipping")

  # Load page
  page = pywikibot.Page(site, pagetitle)

  if not blib.try_repeatedly(lambda: page.exists(), pagemsg,
      "check page existence"):
    pagemsg("Page doesn't exist, can't add etymology")
    return

  pagemsg("Adding etymology")
  notes = []
  pagetext = unicode(page.text)

  # Split into sections
  splitsections = re.split("(^==[^=\n]+==\n)", pagetext, 0, re.M)
  # Extract off pagehead and recombine section headers with following text
  pagehead = splitsections[0]
  sections = []
  for i in xrange(1, len(splitsections)):
    if (i % 2) == 1:
      sections.append("")
    sections[-1] += splitsections[i]

  # Go through each section in turn, looking for existing Russian section
  for i in xrange(len(sections)):
    m = re.match("^==([^=\n]+)==$", sections[i], re.M)
    if not m:
      pagemsg("Can't find language name in text: [[%s]]" % (sections[i]))
    elif m.group(1) == "Russian":
      # Override only when there is a body to substitute; for '-'-less specs
      # without a body ('--', part/adj forms) fall through to the normal path.
      if override_etym and etymbody is not None:
        subsections = re.split("(^===+[^=\n]+===+\n)", sections[i], 0, re.M)

        replaced_etym = False
        for j in xrange(2, len(subsections), 2):
          if "==Etymology==" in subsections[j - 1] or "==Etymology 1==" in subsections[j - 1]:
            subsections[j] = etymbody
            replaced_etym = True
            break

        if replaced_etym:
          sections[i] = "".join(subsections)
          # Keep the text that precedes the first language header; omitting
          # pagehead here would delete it from the page on save.
          newtext = pagehead + "".join(sections)
          notes.append("replace Etymology section in Russian lemma with manually specified etymology")
          break

      if "==Etymology==" in sections[i] or "==Etymology 1==" in sections[i]:
        errandpagemsg("WARNING: Already found etymology, skipping")
        return

      subsections = re.split("(^===+[^=\n]+===+\n)", sections[i], 0, re.M)

      # Insert before the first subsection header, or before the second one
      # if the first is Alternative forms.
      insert_before = 1
      if "===Alternative forms===" in subsections[insert_before]:
        insert_before += 2

      subsections[insert_before] = etymtext + subsections[insert_before]
      sections[i] = "".join(subsections)
      if add_passive_of:
        active_term = rulib.remove_monosyllabic_accents(
          re.sub(u"с[яь]$", "", accented_term))
        sections[i] = re.sub(r"(^(#.*\n)+)",
          r"\1# {{passive of|lang=ru|%s}}\n" % active_term,
          sections[i], 1, re.M)

      newtext = pagehead + "".join(sections)
      notes.append("add (manually specified) Etymology section to Russian lemma")
      break
  else:
    errandpagemsg("WARNING: Can't find Russian section, skipping")
    return

  if newtext != pagetext:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (pagetext, newtext))
    # Any textual change must have been recorded as a note above.
    assert notes
    # Go through blib, as the other blib helpers in this function do.
    comment = "; ".join(blib.group_notes(notes))
    if save:
      blib.safe_page_save(page, comment, errandpagemsg)
    else:
      pagemsg("Would save with comment = %s" % comment)
Example #7
0
def process_page(index, page, save, verbose):
  """Canonicalize short-form 'inflection of' usage in a page's Russian section.

  Existing {{inflection of|lang=ru|...}} templates whose numbered params 3 and
  up spell out a short form (in any of several orderings) are rewritten to
  short|<m/f/n>|s or short|p. Raw-text definitions along the lines of
  "masculine short form of [[lemma]]" are then replaced by the template.

  index -- value used only in log-line prefixes
  page -- page object; its title(), text and save() are used
  save -- when true, write the changed text back; otherwise only log
  verbose -- when true, also log the complete old and new page text
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  # Accepted spellings of params 3+ (joined with "/") for a singular short
  # form; group 1 is always the gender letter. Tried in this order.
  singular_patterns = (
    r"^([mfn])/(?:s|\(singular\))/short(?: form|)$",
    r"^(?:s|\(singular\))/([mfn])/short(?: form|)$",
    r"^short(?: form|)/([mfn])/(?:s|\(singular\))$",
    r"^short(?: form|)/(?:s|\(singular\))/([mfn])$",
    r"^([mfn])/short(?: form|)/(?:s|\(singular\))$",
    r"^(?:s|\(singular\))/short(?: form|)/([mfn])$",
    r"^([mfn])/short(?: form|)$",
    r"^short(?: form|)/([mfn])$",
  )
  # Accepted spellings for a plural short form.
  plural_patterns = (
    r"^(?:p|\(plural\))/short(?: form|)$",
    r"^short(?: form|)/(?:p|\(plural\))$",
  )
  # Raw-text definition line: group 1 is the leading "# " or "(", group 2 the
  # gender word, group 3 the linked lemma.
  raw_definition_re = r"(# |\()'*(?:short |)(?:form of |)(masculine|male|feminine|female|neuter|neutral) (?:short |)(?:singular |)(?:short |)(?:form of|of|for)'* '*(?:\[\[|\{\{[lm]\|ru\|)(.*?)(?:\]\]|\}\})'*"
  gender_codes = {"masculine":"m", "male":"m", "feminine":"f", "female":"f",
      "neuter":"n", "neutral":"n"}

  text = unicode(page.text)
  notes = []
  already_canonicalized = False
  found_short_inflection_of = False
  warned_about_short = False
  foundrussian = False

  def raw_to_inflection_of(match):
    # Substitution callback for raw_definition_re; logs and records a note
    # for every replacement it makes.
    code = gender_codes[match.group(2).lower()]
    replacement = match.group(1) + "{{inflection of|lang=ru|%s||short|%s|s}}" % (
        match.group(3), code)
    pagemsg("Replaced <%s> with %s" % (match.group(0), replacement))
    notes.append("converted raw to 'inflection of' for short/%s/s" % code)
    return replacement

  # Odd entries are the captured ==Language== headers, even entries the bodies.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  for j in xrange(2, len(sections), 2):
    if sections[j-1] != "==Russian==\n":
      continue
    if foundrussian:
      pagemsg("WARNING: Found multiple Russian sections, skipping page")
      return
    foundrussian = True

    # Step 1: canonicalize existing 'inflection of' templates.
    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      if unicode(t.name) != "inflection of" or getparam(t, "lang") != "ru":
        continue

      # Numbered params 3..19 with trailing blanks dropped.
      tail_params = [getparam(t, str(num)) for num in xrange(3, 20)]
      while tail_params and not tail_params[-1]:
        tail_params.pop()
      joined = "/".join(tail_params)

      canon_params = []
      singular = None
      for pattern in singular_patterns:
        singular = re.search(pattern, joined)
        if singular:
          break
      if singular:
        found_short_inflection_of = True
        canon_params = ["short", singular.group(1), "s"]
      elif any(re.search(pattern, joined) for pattern in plural_patterns):
        found_short_inflection_of = True
        canon_params = ["short", "p"]
      elif "short" in tail_params or "short form" in tail_params:
        found_short_inflection_of = True
        warned_about_short = True
        pagemsg("WARNING: Apparent short-form 'inflection of' but can't canonicalize: %s" %
            unicode(t))

      if not canon_params:
        continue

      before = unicode(t)
      # Save params 1 and 2, wipe every numbered param, then put 1 and 2 back
      # (they land after lang=ru) followed by the canonical params from 3 on.
      first_param = getparam(t, "1")
      second_param = getparam(t, "2")
      for num in xrange(19, 0, -1):
        rmparam(t, str(num))
      t.add("1", first_param)
      t.add("2", second_param)
      for offset, value in enumerate(canon_params):
        t.add(str(offset + 3), value)
      after = unicode(t)
      if before == after:
        pagemsg("Apparently already canonicalized: %s" % after)
        already_canonicalized = True
      else:
        pagemsg("Replaced %s with %s" % (before, after))
        notes.append("canonicalized 'inflection of' for %s" % "/".join(canon_params))
    sections[j] = unicode(parsed)

    # Step 2: turn raw-text singular definitions into 'inflection of'.
    converted = re.sub(raw_definition_re, raw_to_inflection_of,
        sections[j], 0, re.I)
    if converted != sections[j]:
      found_short_inflection_of = True
    sections[j] = converted

    # Step 3: "short" is mentioned but nothing above recognized it.
    if "short" in sections[j] and not found_short_inflection_of:
      line_match = re.search("^(.*short.*)$", sections[j], re.M)
      warned_about_short = True
      pagemsg("WARNING: Apparent raw-text short inflection, not converted: %s" %
          (line_match.group(1) if line_match else "Can't get line?"))

  new_text = "".join(sections)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if not save:
      pagemsg("Would save with comment = %s" % comment)
    else:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)

  if not (notes or already_canonicalized):
    pagemsg("Skipping, no short form found%s" % (
      " (warning issued)" if warned_about_short else " (no warning)"))
Example #8
0
def process_page(index, page, save, verbose):
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []

  foundrussian = False
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  def check_for_translation_italics(val, orig):
    val = val.replace("'', ''", ", ")
    if re.search("(?<!')''(?!')", val):
      pagemsg("WARNING: Italics in translation <<%s>>: <<%s>>" % (val, orig))
    return val

  def check_for_stray_vertical_bar(val):
    split_on_paired_brackets_braces = re.split(r"\[\[[^\[\]]*\]\]|\{\{[^{}]*\}\}", val)
    for outside_bracket_brace in split_on_paired_brackets_braces:
      if "|" in outside_bracket_brace:
        pagemsg("WARNING: Stray vertical bar in Russian or English, can't handle: <<%s>>" % val)
        return True
    return False

  for j in xrange(2, len(sections), 2):
    if sections[j-1] == "==Russian==\n":
      if foundrussian:
        pagemsg("WARNING: Found multiple Russian sections, skipping page")
        return
      foundrussian = True

      # Try to convert multi-line usex using #:
      def multi_line_usex(m):
        ru, tr, en = m.groups()
        en = check_for_translation_italics(en, m.group(0))
        if check_for_stray_vertical_bar(ru) or check_for_stray_vertical_bar(tr) or check_for_stray_vertical_bar(en):
          return m.group(0)
        retval = "#: {{ru-ux|%s|tr=%s|%s}}" % (ru, tr, en)
        pagemsg("Replaced <<%s>> with <<%s>>" % (m.group(0), retval))
        notes.append("converted raw multi-line usex to 'ru-ux'")
        return retval
      sections[j] = re.sub(r"^#: \{\{lang\|ru\|([^\n{}]*?)\}\}\n#:: (.*)\n#:::? (.*)$", multi_line_usex, sections[j], 0, re.M)

      # Try to convert multi-line usex using #*
      def multi_line_usex_hidden(m):
        ru, tr, en = m.groups()
        en = check_for_translation_italics(en, m.group(0))
        if check_for_stray_vertical_bar(ru) or check_for_stray_vertical_bar(tr) or check_for_stray_vertical_bar(en):
          return m.group(0)
        retval = "%s#*: {{ru-ux|%s|tr=%s|%s}}" % (ru, tr, en)
        pagemsg("Replaced <<%s>> with <<%s>>" % (m.group(0), retval))
        notes.append("converted raw multi-line hidden usex to 'ru-ux'")
        return retval
      sections[j] = re.sub(r"^(#\* .*?\n)#\*: \{\{lang\|ru\|([^\n{}]*?)\}\}\n#\*:: ([^{}\n]*)\n#\*:::? ([^{}\n]*)$", multi_line_usex_hidden, sections[j], 0, re.M)

      # Try to convert single-line usex that uses {{lang}}, {{l}} or {{m}}
      for tempname in ["lang", "l", "m"]:
        def single_line_usex_lang_l_m(m):
          ru, en = m.groups()
          en = check_for_translation_italics(en, m.group(0))
          if tempname == "lang" or "[" in ru:
            if check_for_stray_vertical_bar(ru) or check_for_stray_vertical_bar(en):
              return m.group(0)
            retval = "#: {{ru-ux|%s|%s|inline=y}}" % (ru, en)
          else:
            if "|tr=" in ru:
              pagemsg("WARNING: Found |tr= in link, can't handle: <<%s>>" %
                  m.group(0))
              return m.group(0)
            # A single vertical bar in ru is allowed here; it will be handled
            # correctly because we wrap it in a raw link
            if check_for_stray_vertical_bar(en):
              return m.group(0)
            retval = "#: {{ru-ux|[[%s]]|%s|inline=y}}" % (ru, en)
          pagemsg("Replaced <<%s>> with <<%s>>" % (m.group(0), retval))
          notes.append("converted raw single-line usex using {{%s}} to 'ru-ux'" % tempname)
          return retval
        # Version with ''...'' around the translation; do this first in case
        # we have bold (''') around the first Russian word and italics ('')
        # around the translation; in the opposite order, the bold will get
        # treated as italics
        sections[j] = re.sub(ur"^#:\*? \{\{%s\|ru\|([^\n{}]*?)\}\}(?: |\&nbsp;)(?:—|\&mdash;)(?: |\&nbsp;)''(.*?)''$" % tempname, single_line_usex_lang_l_m, sections[j], 0, re.M)
        # Version with ''...'' around the whole thing
        sections[j] = re.sub(ur"^#:\*? ''\{\{%s\|ru\|([^\n{}]*?)\}\}(?: |\&nbsp;)(?:—|\&mdash;)(?: |\&nbsp;)(.*?)''$" % tempname, single_line_usex_lang_l_m, sections[j], 0, re.M)
        # Version without ''...''
        sections[j] = re.sub(ur"^#:\*? \{\{%s\|ru\|([^\n{}]*?)\}\}(?: |\&nbsp;)(?:—|\&mdash;)(?: |\&nbsp;)(.*?)$" % tempname, single_line_usex_lang_l_m, sections[j], 0, re.M)

      # Try to convert single-line usex that is raw, maybe allowing braces
      # in the right side
      for allow_braces_on_right in [False, True]:
        maybe_exclude_braces = "" if allow_braces_on_right else "{}"
        allow_braces_msg = ", allowing braces on right side" if allow_braces_on_right else ""
        def single_line_usex_raw(m):
          ru, en = m.groups()
          en = check_for_translation_italics(en, m.group(0))
          if check_for_stray_vertical_bar(ru) or check_for_stray_vertical_bar(en):
            return m.group(0)
          retval = "#: {{ru-ux|%s|%s|inline=y}}" % (ru, en)
          pagemsg("Replaced <<%s>> with <<%s>>" % (m.group(0), retval))
          notes.append("converted pure raw single-line usex to 'ru-ux'%s" %
              allow_braces_msg)
          return retval
        # Version with ''...'' around the translation; do this first in case
        # we have bold (''') around the first Russian word and italics ('')
        # around the translation; in the opposite order, the bold will get
        # treated as italics
        sections[j] = re.sub(ur"^#:\*? ([^{}\n]*)(?: |\&nbsp;)(?:—|-|\&mdash;)(?: |\&nbsp;)''([^%s\n]*?)''$" % maybe_exclude_braces, single_line_usex_raw, sections[j], 0, re.M)
        # Version with ''...'' around the whole thing; the expression after
        # the '' is a disjunctive lookahead expression and says "(two single
        # quotes) either followed by 3 more quotes (combination bold+italic)
        # or not followed by any quote (to exclude bold = ''')
        sections[j] = re.sub(ur"^#:\*? ''(?:(?!')|(?='''))([^{}\n]*)(?: |\&nbsp;)(?:—|-|\&mdash;)(?: |\&nbsp;)([^%s\n]*?)''$" % maybe_exclude_braces, single_line_usex_raw, sections[j], 0, re.M)
        # Version without ''...''
        sections[j] = re.sub(ur"^#:\*? ([^{}\n]*)(?: |\&nbsp;)(?:—|-|\&mdash;)(?: |\&nbsp;)([^%s\n]*?)$" % maybe_exclude_braces, single_line_usex_raw, sections[j], 0, re.M)

  new_text = "".join(sections)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <<%s>> with <<%s>>" % (text, new_text))
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
Example #9
0
def process_page(index, page, save, verbose):
  """Convert {{head|ru|noun form}} to {{ru-noun form}} and canonicalize it.

  Every template on the page is examined. {{head|ru|noun form|...}} is renamed
  to {{ru-noun form}} with head= moved to param 1 and g= to param 2; existing
  {{ru-noun form}} templates get the same parameter layout. The whole page is
  skipped (with a warning) if a template carries unexpected parameters.

  index -- value used only in log-line prefixes
  page -- page object; its title(), text and save() are used
  save -- when true, write the changed text back; otherwise only log
  verbose -- when true, also log the complete old and new page text
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []

  def restore_params(tmpl, head, head2, g, g2, g3, tr, tr2):
    # Re-add the params in canonical order. Param 1 is added (possibly
    # blank) whenever a gender exists, since the gender goes in param 2.
    if head or g:
      tmpl.add("1", head)
    ordered = (("head2", head2), ("2", g), ("g2", g2), ("g3", g3),
        ("tr", tr), ("tr2", tr2))
    for pname, pvalue in ordered:
      if pvalue:
        tmpl.add(pname, pvalue)

  def record_change(before, tmpl, note):
    # Log and note the edit only if the template text actually changed.
    after = unicode(tmpl)
    if before != after:
      pagemsg("Replaced %s with %s" % (before, after))
      notes.append(note)

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    tname = unicode(t.name)

    if tname == "head" and getparam(t, "1") == "ru" and getparam(t, "2") == "noun form":
      if getparam(t, "3"):
        pagemsg("WARNING: Found param 3 in {{head|ru|noun form}}: %s" %
            unicode(t))
        return
      # Strip everything we know how to carry over, in a fixed order.
      rmparam(t, "1")
      rmparam(t, "2")
      head, head2, tr, tr2, g, g2, g3 = [
        getrmparam(t, pname)
        for pname in ("head", "head2", "tr", "tr2", "g", "g2", "g3")
      ]
      # Anything left over is a param we don't understand.
      if len(t.params) > 0:
        pagemsg("WARNING: Extra params in noun form template: %s" %
            unicode(t))
        return
      t.name = "ru-noun form"
      restore_params(t, head, head2, g, g2, g3, tr, tr2)
      record_change(origt, t,
          "convert {{head|ru|noun form}} to {{ru-noun form}}")

    elif tname == "ru-noun form":
      if getparam(t, "head") and getparam(t, "1"):
        pagemsg("WARNING: ru-noun form has both params 1= and head=: %s" %
            unicode(t))
        return
      if getparam(t, "g") and getparam(t, "2"):
        pagemsg("WARNING: ru-noun form has both params 2= and g=: %s" %
            unicode(t))
        return
      # Prefer the numbered param; the named one is only fetched (and
      # removed) when the numbered one is empty.
      head = getrmparam(t, "1") or getrmparam(t, "head")
      head2, tr, tr2 = [getrmparam(t, pname) for pname in ("head2", "tr", "tr2")]
      g = getrmparam(t, "2") or getrmparam(t, "g")
      g2, g3 = [getrmparam(t, pname) for pname in ("g2", "g3")]
      if len(t.params) > 0:
        pagemsg("WARNING: Extra params in noun form template: %s" %
            unicode(t))
        return
      restore_params(t, head, head2, g, g2, g3, tr, tr2)
      record_change(origt, t, "canonicalize ru-noun form")

  new_text = unicode(parsed)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if not save:
      pagemsg("Would save with comment = %s" % comment)
    else:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
Example #10
0
def process_page(index, page, save, verbose):
  """Rename reference-*/usenet templates on a page and fix up their params.

  Inside a References section or a <ref> tag, reference-journal/-news/-book
  become cite-journal/cite-book; elsewhere they become quote-journal/
  quote-book. cite-usenet and quote-usenet become quote-newsgroup, and the
  pairs in `simple_replace` are applied as plain renames. Several parameters
  are renamed or cleaned along the way (see the local helpers).

  index -- value used only in log-line prefixes
  page -- page object; its exists(), title(), text and save() are used
  save -- when true, write the changed text back; otherwise only log
  verbose -- when true, also log the complete old and new page text
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist")
    return

  if ":" in pagetitle and not re.search(
      "^(Citations|Appendix|Reconstruction|Transwiki|Talk|Wiktionary|[A-Za-z]+ talk):", pagetitle):
    pagemsg("WARNING: Colon in page title and not a recognized namespace to include, skipping page")
    return

  text = unicode(page.text)
  notes = []

  # Odd entries are the captured section headers (any level), even entries
  # the text between them.
  subsections = re.split("(^==.*==\n)", text, 0, re.M)

  def move_param(t, fr, to, frob_from=None):
    """Rename param `fr` of template `t` to `to`.

    A blank `fr` is simply removed. If `frob_from` is given it transforms the
    value; a blank/None result cancels the move. If `to` already holds a
    non-blank value, a warning is logged and nothing is changed.
    """
    if t.has(fr):
      oldval = getparam(t, fr)
      if not oldval.strip():
        rmparam(t, fr)
        pagemsg("Removing blank param %s" % fr)
        return
      if frob_from:
        newval = frob_from(oldval)
        if not newval or not newval.strip():
          return
      else:
        newval = oldval

      if getparam(t, to).strip():
        pagemsg("WARNING: Would replace %s= -> %s= but %s= is already present: %s"
            % (fr, to, to, unicode(t)))
      elif oldval != newval:
        rmparam(t, to) # in case of blank param
        # If either old or new name is a number, use remove/add to automatically set the
        # showkey value properly; else it's safe to just change the name of the param,
        # which will preserve its location.
        if re.search("^[0-9]+$", fr) or re.search("^[0-9]+$", to):
          rmparam(t, fr)
          t.add(to, newval)
        else:
          tfr = t.get(fr)
          tfr.name = to
          tfr.value = newval
        pagemsg("%s=%s -> %s=%s" % (fr, oldval.replace("\n", r"\n"), to,
          newval.replace("\n", r"\n")))
      else:
        rmparam(t, to) # in case of blank param
        # See comment above.
        if re.search("^[0-9]+$", fr) or re.search("^[0-9]+$", to):
          rmparam(t, fr)
          t.add(to, newval)
        else:
          t.get(fr).name = to
        pagemsg("%s -> %s" % (fr, to))

  def fix_page_params(t):
    """Strip a leading "p."/"pp." from page=/pages= and swap the two params
    when the value suits the other one. Return True if `t` changed."""
    origt = unicode(t)
    for param in ["page", "pages"]:
      pageval = getparam(t, param)
      if re.search(r"^\s*pp?\.\s*", pageval):
        pageval = re.sub(r"^(\s*)pp?\.\s*", r"\1", pageval)
        t.add(param, pageval)
        notes.append("remove p(p). from %s=" % param)
        pagemsg("remove p(p). from %s=" % param)
    # A bare number in pages= is a single page.
    if re.search(r"^[0-9]+$", getparam(t, "pages").strip()):
      move_param(t, "pages", "page")
    # NOTE(review): this only matches a number followed by a trailing dash
    # (e.g. "12-"), not a full range like "12-15" -- confirm that is intended.
    if re.search(r"^[0-9]+[-–—]$", getparam(t, "page").strip()):
      move_param(t, "page", "pages")
    return origt != unicode(t)

  def fix_cite_book_params(t):
    """Convert reference-book style params to their cite-/quote-book names
    (origyear/year, origdate, origmonth, id -> isbn, page params).
    Return True if `t` changed."""
    origt = unicode(t)
    if getparam(t, "origyear").strip() and getparam(t, "year").strip():
      if getparam(t, "year_published"):
        pagemsg("WARNING: Would set year_published= but is already present: %s"
            % unicode(t))
      else:
        rmparam(t, "year_published") # in case of blank param
        # Order matters: free up the name "year" before origyear takes it.
        t.get("year").name = "year_published"
        t.get("origyear").name = "year"
        pagemsg("year -> year_published, origyear -> year")
    move_param(t, "origdate", "date")
    move_param(t, "origmonth", "month")
    def frob_isbn(idval):
      # Strip a leading "ISBN"-style label; accept values that already start
      # with a digit; otherwise warn and return None so id= is left alone.
      isbn_re = r"^(\s*)(10-ISBN +|ISBN-13 +|ISBN:? +|ISBN[-=] *)"
      if re.search(isbn_re, idval, re.I):
        return re.sub(isbn_re, r"\1", idval, 0, re.I)
      elif re.search(r"^[0-9]", idval.strip()):
        return idval
      else:
        pagemsg("WARNING: Would replace id= -> isbn= but id=%s doesn't begin with 'ISBN '" %
            idval.replace("\n", r"\n"))
        return None
    move_param(t, "id", "isbn", frob_isbn)
    fix_page_params(t)
    return origt != unicode(t)

  def fix_cite_usenet_params(t):
    """Rename group= -> newsgroup= and link= -> url=. Return True if changed."""
    origt = unicode(t)
    move_param(t, "group", "newsgroup")
    move_param(t, "link", "url")
    return origt != unicode(t)

  def fix_quote_usenet_params(t):
    """Merge monthday=/year= into date= and convert the positional and
    legacy named params to quote-newsgroup names. Return True if changed."""
    origt = unicode(t)
    monthday = getparam(t, "monthday").strip()
    year = getparam(t, "year").strip()
    if monthday and year:
      if getparam(t, "date"):
        pagemsg("WARNING: Would set date= but is already present: %s"
            % unicode(t))
      else:
        rmparam(t, "date") # in case of blank param
        param = t.get("monthday")
        param.name = "date"
        # Numeric month/day gets the year appended with a slash, anything
        # else with a space.
        if re.search("^[0-9]+/[0-9]+$", monthday):
          param.value = "%s/%s" % (monthday, year)
        else:
          param.value = "%s %s" % (monthday, year)
        rmparam(t, "year")
        pagemsg("monthday/year -> date")
    move_param(t, "group", "newsgroup")
    move_param(t, "text", "passage")
    # Numbered params are converted from the highest number down.
    move_param(t, "6", "passage")
    move_param(t, "5", "url")
    move_param(t, "4", "newsgroup")
    move_param(t, "3", "title")
    move_param(t, "2", "author")
    move_param(t, "1", "date")
    return origt != unicode(t)

  def replace_in_reference(parsed, in_what):
    """Rename reference-* templates found in `parsed` to their cite-*
    equivalents; `in_what` only describes the context in log messages."""
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      origt = unicode(t)
      if tname.strip() in ["reference-journal", "reference-news"]:
        set_template_name(t, "cite-journal", tname)
        pagemsg("%s -> cite-journal" % tname.strip())
        notes.append("%s -> cite-journal" % tname.strip())
        fix_page_params(t)
        pagemsg("Replacing %s with %s in %s" %
            (origt, unicode(t), in_what))
      if tname.strip() == "reference-book":
        set_template_name(t, "cite-book", tname)
        pagemsg("reference-book -> cite-book")
        fixed_params = fix_cite_book_params(t)
        notes.append("reference-book -> cite-book%s" % (
          fixed_params and " and fix book cite params" or ""))
        pagemsg("Replacing %s with %s in %s" %
            (origt, unicode(t), in_what))

  for j in xrange(0, len(subsections), 2):
    parsed = blib.parse_text(subsections[j])
    # First pass: templates in reference context become cite-*. The result is
    # written back to subsections[j] once, after the second pass below.
    if j > 0 and re.search(r"^===*References===*\n", subsections[j-1]):
      replace_in_reference(parsed, "==References== section")
    else:
      for t in parsed.filter_tags():
        if unicode(t.tag) == "ref":
          tagparsed = mw.wikicode.Wikicode([t])
          replace_in_reference(tagparsed, "<ref>")
    # Second pass: whatever reference-* templates remain are outside of any
    # reference context and become quote-*.
    need_to_replace_double_quote_prefixes = False
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      origt = unicode(t)
      for fr, to in simple_replace:
        if tname.strip() == fr:
          set_template_name(t, to, tname)
          pagemsg("%s -> %s" % (fr, to))
          notes.append("%s -> %s" % (fr, to))
          fix_page_params(t)
          pagemsg("Replacing %s with %s" % (origt, unicode(t)))
      if tname.strip() in ["reference-journal", "reference-news"]:
        set_template_name(t, "quote-journal", tname)
        pagemsg("%s -> quote-journal" % tname.strip())
        notes.append("%s -> quote-journal" % tname.strip())
        fix_page_params(t)
        pagemsg("Replacing %s with %s outside of reference section" %
            (origt, unicode(t)))
      if tname.strip() == "reference-book":
        set_template_name(t, "quote-book", tname)
        # The log line and edit-summary note must name the template actually
        # used here (quote-book, not cite-book).
        pagemsg("reference-book -> quote-book")
        fixed_params = fix_cite_book_params(t)
        notes.append("reference-book -> quote-book%s" % (
          fixed_params and " and fix book cite params" or ""))
        pagemsg("Replacing %s with %s outside of reference section" %
            (origt, unicode(t)))
      if tname.strip() in ["cite-usenet", "quote-usenet"]:
        if tname.strip() == "cite-usenet":
          fixed_params = fix_cite_usenet_params(t)
        else:
          fixed_params = fix_quote_usenet_params(t)
        set_template_name(t, "quote-newsgroup", tname)
        pagemsg("%s -> quote-newsgroup" % tname.strip())
        prefix = getparam(t, "prefix").strip()
        removed_prefix = False
        if prefix:
          if prefix in ["#", "#*"]:
            # Replace the prefix= param by a literal "#* " before the
            # template; this may double an existing "#* ", cleaned up below.
            parsed.insert_before(t, "#* ")
            rmparam(t, "prefix")
            pagemsg("remove prefix=%s, insert #* before template" % prefix)
            need_to_replace_double_quote_prefixes = True
            removed_prefix = True
          else:
            pagemsg("WARNING: Found prefix=%s, not # or #*: %s" %
                (prefix, unicode(t)))
        notes.append("%s -> quote-newsgroup%s%s" % (tname.strip(),
          removed_prefix and
            ", remove prefix=%s, insert #* before template" % prefix or "",
          fixed_params and ", fix params" or ""))
        pagemsg("Replacing %s with %s" % (origt, unicode(t)))
    subsections[j] = unicode(parsed)
    if need_to_replace_double_quote_prefixes:
      newval = re.sub(r"^#\* #\* ", "#* ", subsections[j], 0, re.M)
      if newval != subsections[j]:
        notes.append("remove double #* prefix")
        pagemsg("Removed double #* prefix")
      subsections[j] = newval
  newtext = "".join(subsections)

  if text != newtext:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, newtext))
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = newtext
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)