def process_page(page, index, parsed):
    """Add ``lang=en`` to request templates that have no language code.

    The page text is split on level-2 headers and each chunk is parsed
    separately so that the language of the enclosing section is known.
    Templates outside any section, or inside the English section, get
    ``lang=en`` inserted as their first parameter; templates found in any
    other language section are only reported.

    page: page object providing ``title()`` and ``text``.
    index: page index, used only in log messages.
    parsed: not read; each section is re-parsed from the page text.

    Returns ``(None, None)`` for ignored pages, otherwise
    ``(new_text, notes)``.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if blib.page_should_be_ignored(pagetitle):
        pagemsg("Skipping ignored page")
        return None, None

    notes = []

    def add_english_lang(section_parsed, langname):
        # Insert lang=en at the front of every request template lacking it.
        for t in section_parsed.filter_templates():
            before = unicode(t)
            tn = tname(t)
            if tn not in request_templates or getparam(t, "lang"):
                continue
            if langname and langname != "English":
                pagemsg(
                    "WARNING: Would default to English but in %s section, skipping: %s"
                    % (langname, before))
                continue
            notes.append("add lang=en for {{%s}} with missing lang code" %
                         tn)
            # A blank lang= may still be present; drop it first.
            rmparam(t, "lang")
            # Remember the existing parameters, then wipe them so that
            # lang= can go first.
            saved = [(unicode(p.name), p.value, p.showkey) for p in t.params]
            del t.params[:]
            # Keep the one-parameter-per-line layout if the template uses it.
            trailing = "\n" if "\n" in unicode(t.name) else ""
            t.add("lang", "en" + trailing, preserve_spacing=False)
            for pname, pvalue, pshowkey in saved:
                t.add(pname, pvalue, showkey=pshowkey, preserve_spacing=False)
            pagemsg("Replaced <%s> with <%s>" % (before, unicode(t)))

    pagemsg("Processing")

    text = unicode(page.text)

    # Because of the capturing group, even slots hold section bodies and odd
    # slots hold the header line that precedes the following body.
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

    for body_pos in xrange(0, len(sections), 2):
        langname = None
        if body_pos > 0:
            header = re.search("^==(.*)==\n$", sections[body_pos - 1])
            assert header
            langname = header.group(1)
        section_parsed = blib.parse_text(sections[body_pos])
        add_english_lang(section_parsed, langname)
        sections[body_pos] = unicode(section_parsed)

    return "".join(sections), notes
Ejemplo n.º 2
0
def process_page(page, index, parsed):
    """Rewrite the templates listed in ``templates`` as {{inflection of}}.

    ``templates`` maps a template name to the sequence of inflection tags
    that replace it. For each matching template the language, term,
    alternative display form and gloss are read (from ``lang=``/1/2/3 when
    ``lang=`` is given, else from 1/2/3/4), the template is renamed to
    "inflection of", and the parameters are re-emitted as 1=lang, 2=term,
    3=alt, followed by the tags starting at 4 and ``t=`` for the gloss.

    page: page object providing ``title()`` and ``text``.
    index: page index, used only in log messages.
    parsed: not read; the page text is re-parsed here.

    Returns ``(None, None)`` if the page is ignored or a matching template
    carries an unrecognized parameter, otherwise ``(new_text, notes)``.
    """
    pagetitle = unicode(page.title())
    text = unicode(page.text)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    if blib.page_should_be_ignored(pagetitle):
        pagemsg("WARNING: Page should be ignored")
        return None, None

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn not in templates:
            continue

        infl_params = templates[tn]
        lang = getparam(t, "lang")
        if lang:
            has_lang = True
            term = getparam(t, "1")
            alt = getparam(t, "2")
            gloss = getparam(t, "3")
        else:
            # No lang=: the language code occupies 1= and everything else
            # shifts up by one.
            has_lang = False
            lang = getparam(t, "1")
            term = getparam(t, "2")
            alt = getparam(t, "3")
            gloss = getparam(t, "4")

        # Refuse to touch the page if the template has any parameter that
        # the conversion below would silently drop.
        known = ["lang", "1", "2", "3"]
        if not has_lang:
            known.append("4")
        for param in t.params:
            pname = unicode(param.name).strip()
            if pname not in known:
                pagemsg("WARNING: Unrecognized param %s, skipping" % pname)
                return None, None

        # Erase all params.
        del t.params[:]
        # Put back new params.
        blib.set_template_name(t, "inflection of")
        t.add("1", lang)
        t.add("2", term)
        t.add("3", alt)
        # The counter must not be called `index`: pagemsg() closes over the
        # `index` argument, and rebinding it here would make every later
        # log line report a tag position instead of the page index.
        for tagno, tag in enumerate(infl_params, 4):
            t.add(str(tagno), tag)
        if gloss:
            t.add("t", gloss)
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))
            notes.append("replace {{%s}} with {{inflection of}}" % tn)

    return unicode(parsed), notes
Ejemplo n.º 3
0
def process_page(page, index, parsed):
    """Convert {{doublet}} calls to the numbered-parameter syntax.

    For every {{doublet}} template, blank parameters are dropped, 3= becomes
    alt1=, 4= becomes t1=, and the per-term named parameters (t, gloss, tr,
    ts, pos, lit, alt, sc, id, g) get a "1" suffix. 1=, 2=, notext=, nocap=
    and nocat= are kept as they are. A template containing any other
    non-blank parameter is reported and left unchanged.

    page: page object providing ``title()`` and ``text``.
    index: page index, used only in log messages.
    parsed: not read; the page text is re-parsed here.

    Returns ``(None, None)`` for ignored pages, otherwise
    ``(new_text, notes)``.
    """
    pagetitle = unicode(page.title())
    text = unicode(page.text)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    if blib.page_should_be_ignored(pagetitle):
        pagemsg("WARNING: Page should be ignored")
        return None, None

    suffixed_names = [
        "t", "gloss", "tr", "ts", "pos", "lit", "alt", "sc", "id", "g"
    ]
    untouched_names = ["1", "2", "notext", "nocap", "nocat"]

    def renamed_params(t, origt):
        # Return the converted (name, value, showkey) triples for `t`, or
        # None as soon as an unrecognized non-blank parameter is seen.
        converted = []
        for param in t.params:
            name = unicode(param.name).strip()
            value = unicode(param.value).strip()
            if not value:
                continue
            showkey = param.showkey
            if name == "3":
                name, showkey = "alt1", True
            elif name == "4":
                name, showkey = "t1", True
            elif name in suffixed_names:
                name += "1"
            elif name not in untouched_names:
                pagemsg(
                    "WARNING: Unrecognized param %s=%s in %s, skipping" %
                    (name, value, origt))
                return None
            converted.append((name, value, showkey))
        return converted

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        origt = unicode(t)
        if tname(t) != "doublet":
            continue
        new_params = renamed_params(t, origt)
        if new_params is None:
            continue
        # Replace the whole parameter list with the converted one.
        del t.params[:]
        for name, value, showkey in new_params:
            t.add(name, value, showkey=showkey, preserve_spacing=False)
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))
            notes.append("restructure {{doublet}} for new syntax")

    return unicode(parsed), notes
Ejemplo n.º 4
0
def process_text_on_page(index, pagetitle, text):
    """Log the titles of all level-2 sections found in ``text``.

    index: page index, used only in the log message.
    pagetitle: title of the page; ignored pages produce no output.
    text: wikitext of the page.

    Returns None in every case.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if blib.page_should_be_ignored(pagetitle):
        return

    # The capturing group makes re.split() keep the header lines; they end
    # up in the odd slots of the result.
    pieces = re.split("(^==[^=\n]+==\n)", text, 0, re.M)
    langs = [
        re.search("^==(.*)==$", header).group(1) for header in pieces[1::2]
    ]
    pagemsg("Languages = %s" % ",".join(langs))
Ejemplo n.º 5
0
def process_text_on_page(pagetitle, index, text):
  """Tally the form named by each {{form of}} template in ``text``.

  The form is read from 1= when the template has a non-empty lang=, and from
  2= otherwise; the matching counter in ``form_of_forms`` is incremented.

  pagetitle: title of the page.
  index: page index, used only in log messages.
  text: wikitext of the page.

  Returns ``(None, None)`` for ignored pages and None otherwise.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if blib.page_should_be_ignored(pagetitle):
    pagemsg("WARNING: Page should be ignored")
    return None, None

  for t in blib.parse_text(text).filter_templates():
    if tname(t) != "form of":
      continue
    # With lang= present the positional parameters start one slot earlier.
    form_param = "1" if getparam(t, "lang") else "2"
    form_of_forms[getparam(t, form_param)] += 1
Ejemplo n.º 6
0
def process_text_on_page(index, pagetitle, text):
    """Report subsections with two consecutive lines using the same template.

    For each name in ``inflection_of_templates``, every subsection body is
    searched for two adjacent list lines (starting with ``#`` or ``*``) that
    both open with that template; each hit is logged together with the
    stripped subsection text.

    index: page index, used only in log messages.
    pagetitle: title of the page; ignored pages produce no output.
    text: wikitext of the page.

    Returns None in every case.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if blib.page_should_be_ignored(pagetitle):
        return

    # Cheap pre-filter: nothing to do if no template name occurs at all.
    if not any(name in text for name in inflection_of_templates):
        return

    # Headers of any level are kept in the odd slots; bodies follow from
    # slot 2 onwards in steps of two.
    subsections = re.split("(^==+[^=\n]+==+\n)", text, 0, re.M)
    for body in subsections[2::2]:
        for template in inflection_of_templates:
            two_in_a_row = (
                r"^[#*]+ \{\{%s.*\n[#*]+ \{\{%s.*" % (template, template))
            if re.search(two_in_a_row, body, re.M):
                pagemsg("Found subsection with combinable %s:\n%s" %
                        (template, body.strip()))
def process_text_on_page(index, pagetitle, text):
    """Log {{inflection of}} templates whose tags contain joiners.

    A template is reported when any numbered parameter in the tag range
    (3 and up when lang= is given, 4 and up otherwise) is one of "and",
    "or", ";" or a ";" followed by a comment-wrapped line break, or
    contains "/" or ",". Each template is reported at most once.

    index: page index, used only in log messages.
    pagetitle: title of the page; ignored pages produce no output.
    text: wikitext of the page.

    Returns None in every case.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if blib.page_should_be_ignored(pagetitle):
        return

    if "inflection of" not in text:
        return

    joiners = ["and", "or", ";", ";<!--\n-->"]

    def is_multipart_tag(param, first_tag_param):
        # True if `param` is a tag-range parameter holding a joiner or a
        # value with "/" or ",".
        pname = unicode(param.name).strip()
        pval = unicode(param.value).strip()
        if not re.search("^[0-9]+$", pname):
            return False
        if int(pname) < first_tag_param:
            return False
        return pval in joiners or "/" in pval or "," in pval

    for t in blib.parse_text(text).filter_templates():
        origt = unicode(t)
        if tname(t) != "inflection of":
            continue
        # Tags start two slots after the term, which is 1= with lang= and
        # 2= without it.
        first_tag_param = 3 if getparam(t, "lang") else 4
        if any(is_multipart_tag(param, first_tag_param)
               for param in t.params):
            pagemsg("Found template: %s" % origt)

    return
def process_page(page, index, parsed):
  """Fold runs of adjacent doublet/m/l templates into one {{doublet}}.

  A run is a {{doublet|...}} followed by one or more {{m|...}}, {{l|...}} or
  {{doublet|...}} templates joined by commas and/or "and". The parameters of
  each continuation template are copied onto the first template under the
  names for term 2, 3, ...; notext=, nocap= and nocat= of the first template
  are moved to the end. The run is left unchanged (with a warning) when the
  first template already lists several terms, when a continuation uses a
  different language in 1=, or when an unrecognized parameter is found.

  page: page object providing ``title()`` and ``text``.
  index: page index, used only in log messages.
  parsed: not read.

  Returns ``(None, None)`` for ignored pages, otherwise ``(new_text, notes)``.
  """
  pagetitle = unicode(page.title())
  text = unicode(page.text)

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")
  notes = []

  if blib.page_should_be_ignored(pagetitle):
    pagemsg("WARNING: Page should be ignored")
    return None, None

  per_term_names = ["t", "gloss", "tr", "ts", "pos", "lit", "alt", "sc",
      "id", "g"]
  per_term_names_1 = ["t1", "gloss1", "tr1", "ts1", "pos1", "lit1", "alt1",
      "sc1", "id1", "g1"]
  not_copied = ["1", "notext", "nocap", "nocat"]
  trailing_flags = ["notext", "nocap", "nocat"]

  def combine_doublets(m):
    # Replacement callback: return the merged template, or the matched text
    # unchanged if the run cannot be merged.
    whole = m.group(0)
    head = list(blib.parse_text(m.group(1)).filter_templates())[0]
    tail = blib.parse_text(m.group(2))
    if any(getparam(head, name) for name in ["3", "4", "alt2", "alt3"]):
      pagemsg("WARNING: Can't combine %s, first template already has multiple terms" %
          whole)
      return whole
    head_lang = getparam(head, "1")
    # The first template supplies term 1, so continuations start at term 2.
    for term_no, t in enumerate(tail.filter_templates(recursive=False), 2):
      tlang = getparam(t, "1")
      if tlang != head_lang:
        pagemsg("WARNING: Lang %s in continuation template %s not same as lang %s in first template %s" % (
          tlang, unicode(t), head_lang, unicode(head)))
        return whole
      for param in t.params:
        pname = unicode(param.name).strip()
        pval = unicode(param.value).strip()
        if not pval or pname in not_copied:
          continue
        if pname == "2":
          # Term N lives in positional parameter N+1 (1= is the language).
          target = str(term_no + 1)
        elif pname == "3":
          target = "alt%s" % term_no
        elif pname == "4":
          target = "t%s" % term_no
        elif pname in per_term_names:
          target = "%s%s" % (pname, term_no)
        elif pname in per_term_names_1:
          target = "%s%s" % (pname[:-1], term_no)
        else:
          pagemsg("WARNING: Unrecognized param %s=%s in %s, skipping" %
              (pname, pval, unicode(t)))
          return whole
        head.add(target, pval)
    # Re-add the flag parameters so that they follow the copied terms.
    for flag in trailing_flags:
      flagval = getparam(head, flag)
      rmparam(head, flag)
      if flagval:
        head.add(flag, flagval)
    combined = unicode(head)
    pagemsg("Replaced %s with %s" % (whole, combined))
    return combined

  # Group 1 is the leading {{doublet}}; group 2 is the run of continuation
  # templates together with their separators.
  doublet_run = (
    r"(\{\{doublet\|(?:[^{}\n]|\{\{[^{}\n]*\}\})*\}\})"
    r"((?:(?:, *|,? *and *)\{\{(?:m|l|doublet)\|(?:[^{}\n]|\{\{[^{}\n]*\}\})*\}\})+)"
  )
  newtext = re.sub(doublet_run, combine_doublets, text)
  if newtext != text:
    notes.append("combine adjacent doublets")

  return newtext, notes
Ejemplo n.º 9
0
def process_page(page, index, parsed):
  """Give every quote template in ``quote_templates`` a ``lang=`` parameter.

  On ordinary pages the text is split into language sections and then into
  subsections; a quote template without lang= receives the code of the
  language section it is in (looked up in ``blib.languages_byCanonicalName``).
  Quotes in a Translingual section get ``lang=en`` plus ``termlang=mul``.
  Templates in Etymology/Pronunciation subsections, templates in sections
  with an unknown language, and putative English quotes that carry a
  translation are reported and left alone. An existing ``language=`` value is
  moved to ``lang=``.

  On pages whose title starts with "Citations", the language code is taken
  from the section header when recognized, otherwise from the most recent
  {{citation}}/{{citations}} template, and is carried from one section to
  the next.

  page: page object providing ``title()`` and ``text``.
  index: page index, used only in log messages.
  parsed: not read; each section is re-parsed from the page text.

  Returns ``(None, None)`` for ignored pages, otherwise ``(new_text, notes)``.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  if blib.page_should_be_ignored(pagetitle):
    pagemsg("Skipping ignored page")
    return None, None

  def hack_templates(parsed, langname, subsectitle, langnamecode=None,
      is_citation=False):
    # Add lang= to the quote templates in `parsed`; return the language code
    # in effect at the end so that citation pages can carry it forward.
    if langname not in blib.languages_byCanonicalName:
      # On citation pages an unrecognized header keeps the inherited code.
      if not is_citation:
        langnamecode = None
    else:
      langnamecode = blib.languages_byCanonicalName[langname]["code"]

    for t in parsed.filter_templates():
      origt = unicode(t)
      tn = tname(t)
      if tn in ["citation", "citations"] and is_citation:
        langnamecode = getparam(t, "lang")
      elif tn in quote_templates:
        if getparam(t, "lang"):
          continue
        # The values written to this template are kept in locals. Assigning
        # "en" to langnamecode for a Translingual quote would leak into the
        # following templates (they would lose termlang=mul) and into the
        # return value.
        termlang = None
        lang = getparam(t, "language")
        if lang:
          # Move the explicit value over, as the edit note states. The
          # section code may be None here, so it cannot be used instead.
          newlang = lang
          notes.append("Convert language=%s to lang=%s in %s" % (lang, lang, tn))
        else:
          if subsectitle.startswith("Etymology") or subsectitle.startswith("Pronunciation"):
            pagemsg("WARNING: Found template in %s section for language %s, might be different language, skipping: %s" % (
              subsectitle, langname, origt))
            continue
          if not langnamecode:
            pagemsg("WARNING: Unrecognized language %s, unable to add language to %s" % (langname, tn))
            continue
          if langnamecode == "en" and (getparam(t, "translation") or getparam(t, "t")):
            pagemsg("WARNING: Translation section in putative English quote, skipping: %s" % origt)
            continue
          if langnamecode == "mul":
            newlang = "en"
            termlang = "mul"
            notes.append("infer lang=en for %s in Translingual section and add termlang=mul" % tn)
          else:
            newlang = langnamecode
            notes.append("infer lang=%s for %s based on section it's in" % (langnamecode, tn))
        rmparam(t, "language")
        # Fetch all params.
        params = []
        for param in t.params:
          pname = unicode(param.name)
          params.append((pname, param.value, param.showkey))
        # Erase all params.
        del t.params[:]
        # Put lang parameter first; keep the one-parameter-per-line layout if
        # the template name is followed by a newline.
        newline = "\n" if "\n" in unicode(t.name) else ""
        t.add("lang", newlang + newline, preserve_spacing=False)
        if termlang:
          t.add("termlang", termlang + newline, preserve_spacing=False)
        # Put remaining parameters in order.
        for name, value, showkey in params:
          t.add(name, value, showkey=showkey, preserve_spacing=False)
        pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

    return langnamecode

  pagemsg("Processing")

  text = unicode(page.text)
  notes = []

  # Even slots hold section bodies, odd slots the level-2 header lines.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  if not pagetitle.startswith("Citations"):
    for j in xrange(2, len(sections), 2):
      m = re.search("^==(.*)==\n$", sections[j - 1])
      assert m
      langname = m.group(1)
      subsections = re.split("(^==.*==\n)", sections[j], 0, re.M)
      # Text before the first subsection header (slot 0) is not processed.
      for k in xrange(2, len(subsections), 2):
        m = re.search("^===*(.*?)=*==\n$", subsections[k - 1])
        assert m
        subsectitle = m.group(1)
        parsed = blib.parse_text(subsections[k])
        hack_templates(parsed, langname, subsectitle)
        subsections[k] = unicode(parsed)
      sections[j] = "".join(subsections)
  else:
    # Citation section?
    langnamecode = None
    for j in xrange(0, len(sections), 2):
      if j == 0:
        langname = "Unknown"
      else:
        m = re.search("^==(.*)==\n$", sections[j - 1])
        assert m
        langname = m.group(1)
      parsed = blib.parse_text(sections[j])
      langnamecode = hack_templates(parsed, langname, "Unknown",
          langnamecode=langnamecode, is_citation=True)
      sections[j] = unicode(parsed)

  newtext = "".join(sections)
  return newtext, notes
def process_page(page, index, parsed):
    """Replace ``nocat=`` with ``termlang=mul`` in Translingual quotes.

    Only subsections of the "Translingual" language section are examined.
    A template named in ``quote_templates`` that has a non-empty nocat= and
    lang=en is rebuilt with lang=en and termlang=mul first, followed by its
    other parameters in their original order; nocat= is dropped.

    page: page object providing ``title()`` and ``text``.
    index: page index, used only in log messages.
    parsed: not read; each subsection is re-parsed from the page text.

    Returns ``(None, "")`` for ignored pages, otherwise
    ``(new_text, notes)``.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if blib.page_should_be_ignored(pagetitle):
        pagemsg("Skipping ignored page")
        return None, ""

    notes = []

    def convert_nocat(subsec_parsed):
        for t in subsec_parsed.filter_templates():
            before = unicode(t)
            tn = tname(t)
            if tn not in quote_templates:
                continue
            if not getparam(t, "nocat"):
                continue
            if getparam(t, "lang").strip() != "en":
                continue
            notes.append(
                "convert nocat=1 in lang=en Translingual section to termlang=mul"
            )
            # Keep every parameter except nocat=.
            kept = []
            for param in t.params:
                pname = unicode(param.name)
                if pname.strip() == "nocat":
                    continue
                kept.append((pname, param.value, param.showkey))
            del t.params[:]
            # Keep the one-parameter-per-line layout if the template uses it.
            trailing = "\n" if "\n" in unicode(t.name) else ""
            t.add("lang", "en" + trailing, preserve_spacing=False)
            t.add("termlang", "mul" + trailing, preserve_spacing=False)
            for pname, pvalue, pshowkey in kept:
                t.add(pname, pvalue, showkey=pshowkey, preserve_spacing=False)
            pagemsg("Replaced <%s> with <%s>" % (before, unicode(t)))

    pagemsg("Processing")

    text = unicode(page.text)

    # Even slots hold section bodies, odd slots the level-2 header lines.
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

    for sec_pos in xrange(2, len(sections), 2):
        sec_header = re.search("^==(.*)==\n$", sections[sec_pos - 1])
        assert sec_header
        if sec_header.group(1) != "Translingual":
            continue
        subsections = re.split("(^==.*==\n)", sections[sec_pos], 0, re.M)
        for sub_pos in xrange(2, len(subsections), 2):
            sub_header = re.search("^===*(.*?)=*==\n$",
                                   subsections[sub_pos - 1])
            assert sub_header
            subsec_parsed = blib.parse_text(subsections[sub_pos])
            convert_nocat(subsec_parsed)
            subsections[sub_pos] = unicode(subsec_parsed)
        sections[sec_pos] = "".join(subsections)

    return "".join(sections), notes
Ejemplo n.º 11
0
def process_page(page, index, parsed, lang_in_1):
    """Add a missing language code to templates and rename templates.

    Templates named in ``templates_to_process`` that have no value in the
    language parameter receive the code of the language section they are
    in; templates named in ``templates_to_rename`` are renamed to the mapped
    name.

    page: page object providing ``title()`` and ``text``.
    index: page index, used only in log messages.
    parsed: not read; each section is re-parsed from the page text.
    lang_in_1: if true the language code goes in 1=, otherwise in lang=.

    Returns ``(None, None)`` for ignored pages, otherwise
    ``(new_text, notes)``.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if blib.page_should_be_ignored(pagetitle):
        pagemsg("Skipping ignored page")
        return None, None

    # Name of the parameter that carries the language code.
    langparam = "1" if lang_in_1 else "lang"

    def hack_templates(parsed, langname, langnamecode=None, is_citation=False):
        # Process the templates of one section. Returns the language code in
        # effect at the end, which the citation-page loop below feeds into
        # the next section.
        if langname not in blib.languages_byCanonicalName:
            # Unrecognized header: citation pages keep the inherited code,
            # other pages have no code to work with.
            if not is_citation:
                langnamecode = None
        else:
            langnamecode = blib.languages_byCanonicalName[langname]["code"]

        for t in parsed.filter_templates():
            origt = unicode(t)
            tn = tname(t)
            # On citation pages a {{citation}}/{{citations}} template sets
            # the code used for the templates that follow it.
            if tn in ["citation", "citations"] and is_citation:
                langnamecode = getparam(t, "lang") or getparam(t, "1")
            if tn in templates_to_process:
                if getparam(t, langparam):
                    # Already has a language code; nothing to add.
                    pass
                elif not langnamecode:
                    pagemsg(
                        "WARNING: Unrecognized language %s, unable to add language to %s"
                        % (langname, origt))
                else:
                    notes.append(
                        "infer %s=%s for {{%s}} based on section it's in" %
                        (langparam, langnamecode, tn))
                    # Keep the one-parameter-per-line layout if the template
                    # name is followed by a newline.
                    newline = "\n" if "\n" in unicode(t.name) else ""
                    if langparam == "1":
                        # The code goes in 1=, so any lang= present is
                        # removed first.
                        if t.has("lang"):
                            pagemsg(
                                "WARNING: Template has lang=, removing: %s" %
                                origt)
                            notes.append("remove lang= from {{%s}}" % tn)
                            rmparam(t, "lang")
                        t.add(langparam,
                              langnamecode + newline,
                              preserve_spacing=False)
                    else:
                        # lang= is placed first: save the parameters, clear
                        # the list, add lang=, then re-add the rest.
                        # Fetch all params.
                        params = []
                        for param in t.params:
                            pname = unicode(param.name)
                            params.append((pname, param.value, param.showkey))
                        # Erase all params.
                        del t.params[:]
                        t.add(langparam,
                              langnamecode + newline,
                              preserve_spacing=False)
                        # Put remaining parameters in order.
                        for name, value, showkey in params:
                            t.add(name,
                                  value,
                                  showkey=showkey,
                                  preserve_spacing=False)
            # Renaming is independent of the language handling above.
            if tn in templates_to_rename:
                blib.set_template_name(t, templates_to_rename[tn])
                notes.append("rename {{%s}} to {{%s}}" %
                             (tn, templates_to_rename[tn]))
            newt = unicode(t)
            if newt != origt:
                pagemsg("Replaced <%s> with <%s>" % (origt, newt))

        return langnamecode

    pagemsg("Processing")

    text = unicode(page.text)
    notes = []

    # The capturing group keeps the level-2 header lines in the odd slots;
    # section bodies are in the even slots.
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

    if not pagetitle.startswith("Citations"):
        # Ordinary page: the text before the first header (slot 0) is not
        # processed.
        for j in xrange(2, len(sections), 2):
            m = re.search("^==(.*)==\n$", sections[j - 1])
            assert m
            langname = m.group(1)
            parsed = blib.parse_text(sections[j])
            hack_templates(parsed, langname)
            sections[j] = unicode(parsed)
    else:
        # Citation section?
        # The code returned for one section is passed on to the next, and
        # the text before the first header is processed too.
        langnamecode = None
        for j in xrange(0, len(sections), 2):
            if j == 0:
                langname = "Unknown"
            else:
                m = re.search("^==(.*)==\n$", sections[j - 1])
                assert m
                langname = m.group(1)
            parsed = blib.parse_text(sections[j])
            langnamecode = hack_templates(parsed,
                                          langname,
                                          langnamecode=langnamecode,
                                          is_citation=True)
            sections[j] = unicode(parsed)

    newtext = "".join(sections)
    return newtext, notes
Ejemplo n.º 12
0
def process_text_on_page(pagetitle, index, text):
    """Normalize the parameters of every {{inflection of}} template.

    Each template is rebuilt as 1=lang, 2=term, optional tr=, 3=alt, the
    non-empty inflection tags from 4= onwards, and finally all remaining
    named parameters. A ``lang=`` value is moved to 1=, empty tags are
    dropped, and for Polish (``pl``) the tag "other" becomes "nv". lang,
    term, tr and alt are passed through ``remove_comment_continuations``.

    pagetitle: title of the page.
    index: page index, used only in log messages.
    text: wikitext of the page.

    Returns ``(None, None)`` for ignored pages, otherwise
    ``(new_text, notes)``.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    if blib.page_should_be_ignored(pagetitle):
        pagemsg("WARNING: Page should be ignored")
        return None, None

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn != "inflection of":
            continue

        lang = getparam(t, "lang")
        if lang:
            # Old layout: lang= given, term in 1=.
            term_param = 1
            notes.append("moved lang= in {{%s}} to 1=" % tn)
        else:
            # New layout: language in 1=, term in 2=.
            lang = getparam(t, "1")
            term_param = 2
        tr = getparam(t, "tr")
        term = getparam(t, str(term_param))
        alt = getparam(t, "alt") or getparam(t, str(term_param + 1))
        first_tag_param = term_param + 2

        # Sort the parameters into inflection tags and other named params;
        # lang/tr/alt and the term/alt positions are handled separately.
        tags = []
        named_params = []
        for param in t.params:
            pname = unicode(param.name).strip()
            pval = unicode(param.value).strip()
            if re.search("^[0-9]+$", pname):
                if int(pname) < first_tag_param:
                    continue
                if pval:
                    tags.append(pval)
                else:
                    notes.append("removed empty tags from {{%s}}" % tn)
            elif pname not in ["lang", "tr", "alt"]:
                named_params.append((pname, pval, param.showkey))

        if lang == "pl":
            newtags = ["nv" if tag == "other" else tag for tag in tags]
            if tags != newtags:
                notes.append(
                    "replaced 'other' with 'nv' in Polish {{%s}}" % tn)
            tags = newtags

        # Rebuild the template from scratch. Comment continuations and line
        # breaks are stripped from the leading values. (FIXME carried over
        # from the original: consider preserving line breaks.)
        del t.params[:]
        t.add("1", remove_comment_continuations(lang))
        t.add("2", remove_comment_continuations(term))
        tr = remove_comment_continuations(tr)
        if tr:
            t.add("tr", tr)
        t.add("3", remove_comment_continuations(alt))
        for tagno, tag in enumerate(tags, 4):
            t.add(str(tagno), tag)
        for pname, pval, showkey in named_params:
            t.add(pname, pval, showkey=showkey, preserve_spacing=False)

        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes