Beispiel #1
0
def process_page_section(index, page, section, verbose):
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None

  parsed = blib.parse_text(section)

  noun_table_templates = []
  noun_old_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)

  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if len(noun_table_templates) < 1:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
          ", ".join(unicode(x) for x in noun_old_templates))
    return unicode(parsed), 0, 0, 0, []

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      pagemsg("Found ru-noun+ or ru-proper noun+, skipping")
      return None

  headword_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      headword_templates.append(t)

  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if len(headword_templates) < 1:
    return unicode(parsed), 0, 0, 0, []

  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
  headword_template = headword_templates[0]
  frobbed_manual_translit = []
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  # Retrieve headword translit and maybe transfer to decl
  headword_tr = getparam(headword_template, "tr")
  if headword_tr:
    if verbose:
      pagemsg("Found headword manual translit tr=%s" % headword_tr)
    if "," in headword_tr:
      pagemsg("WARNING: Comma in headword manual translit, skipping: %s" %
          headword_tr)
      return None
    # Punt if multi-arg-set, can't handle yet
    for decl_template in decl_templates:
      for param in decl_template.params:
        if not param.showkey:
          val = unicode(param.value)
          if val == "or":
            pagemsg("WARNING: Manual translit and multi-decl templates, can't handle, skipping: %s" % unicode(decl_template))
            return None
          if val == "-" or val == "_" or val.startswith("join:"):
            pagemsg("WARNING: Manual translit and multi-word templates, can't handle, skipping: %s" % unicode(decl_template))
            return None
      for i in xrange(2, 10):
        if getparam(headword_template, "tr%s" % i):
          pagemsg("WARNING: Headword template has translit param tr%s, can't handle, skipping: %s" % (
            i, unicode(headword_template)))
          return None
      if runoun.arg1_is_stress(getparam(decl_template, "1")):
        lemma_arg = "2"
      else:
        lemma_arg = "1"
      lemmaval = getparam(decl_template, lemma_arg)
      if not lemmaval:
        lemmaval = subpagetitle
      if "//" in lemmaval:
        m = re.search("^(.*?)//(.*)$", lemmaval)
        if m.group(2) != headword_tr:
          pagemsg("WARNING: Found existing manual translit in decl template %s, but doesn't match headword translit %s; skipping" % (
            lemmaval, headword_tr))
          return None
        else:
          pagemsg("Already found manual translit in decl template %s" %
              lemmaval)
      else:
        lemmaval += "//" + headword_tr
        orig_decl_template = unicode(decl_template)
        decl_template.add(lemma_arg, lemmaval)
        pagemsg("Replacing decl %s with %s" % (orig_decl_template,
          unicode(decl_template)))
        frobbed_manual_translit = [headword_tr]

  genders = blib.fetch_param_chain(headword_template, "2", "g")

  bian_replaced = 0

  # Change a=bi in decl to a=ia or a=ai, depending on order of anim/inan in
  # headword template
  for decl_template in decl_templates:
    if getparam(decl_template, "a") in ["b", "bi", "bian", "both"]:
      saw_in = -1
      saw_an = -1
      for i,g in enumerate(genders):
        if re.search(r"\bin\b", g) and saw_in < 0:
          saw_in = i
        if re.search(r"\ban\b", g) and saw_an < 0:
          saw_an = i
      if saw_in >= 0 and saw_an >= 0:
        orig_decl_template = unicode(decl_template)
        if saw_in < saw_an:
          pagemsg("Replacing a=bi with a=ia in decl template")
          decl_template.add("a", "ia")
          bian_replaced = 1
        else:
          pagemsg("Replacing a=bi with a=ai in decl template")
          decl_template.add("a", "ai")
          bian_replaced = 1
        pagemsg("Replacing decl %s with %s" % (orig_decl_template,
          unicode(decl_template)))

  generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
      unicode(noun_table_template))
  generate_result = expand_text(generate_template)
  if not generate_result:
    pagemsg("WARNING: Error generating noun args, skipping")
    return None
  args = ru.split_generate_args(generate_result)

  genders = runoun.check_old_noun_headword_forms(headword_template, args,
      subpagetitle, pagemsg)
  if genders == None:
    return None

  new_params = []
  for param in noun_table_template.params:
    new_params.append((param.name, param.value))

  orig_headword_template = unicode(headword_template)
  params_to_preserve = runoun.fix_old_headword_params(headword_template,
      new_params, genders, pagemsg)
  if params_to_preserve == None:
    return None

  if unicode(headword_template.name) == "ru-proper noun":
    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(headword_template, "n"):
      pagemsg("Adding n=both to headword tempate")
      headword_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
          generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = ru.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
              existing_n)
        else:
          pagemsg("Removing n=sg from headword tempate")
          rmparam(headword_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
            ndef_args["n"])

  headword_template.params.extend(params_to_preserve)
  ru_noun_changed = 0
  ru_proper_noun_changed = 0
  if unicode(headword_template.name) == "ru-noun":
    headword_template.name = "ru-noun+"
    ru_noun_changed = 1
  else:
    headword_template.name = "ru-proper noun+"
    ru_proper_noun_changed = 1

  pagemsg("Replacing headword %s with %s" % (orig_headword_template, unicode(headword_template)))

  return unicode(parsed), ru_noun_changed, ru_proper_noun_changed, bian_replaced, frobbed_manual_translit
            proposed_decl, "n"):
        proposed_decl.add("n", "sg")

    # This will check number mismatch (and animacy mismatch, but that shouldn't
    # occur as we've taken the animacy directly from the headword)
    new_genders = runounlib.check_old_noun_headword_forms(
        headword_template,
        genargs,
        subpagetitle,
        pagemsg_with_proposed,
        laxer_comparison=True)
    if new_genders == None:
        return None

    orig_headword_template = unicode(headword_template)
    params_to_preserve = runounlib.fix_old_headword_params(
        headword_template, params, new_genders, pagemsg_with_proposed)
    if params_to_preserve == None:
        return None

    headword_template.params.extend(params_to_preserve)

    notes = []
    ru_noun_changed = 0
    ru_proper_noun_changed = 0
    if unicode(headword_template.name) == "ru-noun":
        headword_template.name = "ru-noun+"
        notes.append(
            "convert multi-word ru-noun to ru-noun+ by looking up decls")
    else:
        headword_template.name = "ru-proper noun+"
        notes.append(
  if not generate_result:
    pagemsg_with_proposed("WARNING: Error generating noun args, skipping")
    return
  genargs = ru.split_generate_args(generate_result)
  if headword_is_proper and genargs["n"] == "s" and not getparam(proposed_decl, "n"):
    proposed_decl.add("n", "sg")

  # This will check number mismatch (and animacy mismatch, but that shouldn't
  # occur as we've taken the animacy directly from the headword)
  new_genders = runoun.check_old_noun_headword_forms(headword_template, genargs,
      subpagetitle, pagemsg_with_proposed, laxer_comparison=True)
  if new_genders == None:
    return None

  orig_headword_template = unicode(headword_template)
  params_to_preserve = runoun.fix_old_headword_params(headword_template,
      params, new_genders, pagemsg_with_proposed)
  if params_to_preserve == None:
    return None

  headword_template.params.extend(params_to_preserve)

  notes = []
  ru_noun_changed = 0
  ru_proper_noun_changed = 0
  if unicode(headword_template.name) == "ru-noun":
    headword_template.name = "ru-noun+"
    notes.append("convert multi-word ru-noun to ru-noun+ by looking up decls")
  else:
    headword_template.name = "ru-proper noun+"
    notes.append("convert multi-word ru-proper noun to ru-proper noun+ by looking up decls")
Beispiel #4
0
def process_page_section(index, page, section, verbose):
    pagetitle = unicode(page.title())
    subpagetitle = re.sub("^.*:", "", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    if not page.exists():
        pagemsg("WARNING: Page doesn't exist, skipping")
        return None

    parsed = blib.parse_text(section)

    noun_table_templates = []
    noun_old_templates = []

    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-decl-noun-see":
            pagemsg("Found ru-decl-noun-see, skipping")
            return None

    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-noun-table":
            noun_table_templates.append(t)
        if unicode(t.name) == "ru-noun-old":
            noun_old_templates.append(t)

    if len(noun_table_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
        return None
    if len(noun_old_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
        return None
    if not noun_table_templates and not noun_old_templates:
        return unicode(parsed), 0, 0, 0, []

    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
            pagemsg("Found ru-noun+ or ru-proper noun+, skipping")
            return None

    headword_templates = []

    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
            headword_templates.append(t)

    if len(headword_templates) > 1:
        pagemsg("WARNING: Found multiple headword templates, skipping")
        return None
    if len(headword_templates) < 1:
        return unicode(parsed), 0, 0, 0, []

    noun_table_template = noun_table_templates[0] if len(
        noun_table_templates) == 1 else None
    noun_old_template = noun_old_templates[0] if len(
        noun_old_templates) == 1 else None
    if noun_old_template and not noun_table_template:
        noun_table_template = noun_old_template
        noun_old_template = None
    headword_template = headword_templates[0]
    frobbed_manual_translit = []
    decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

    if verbose:
        pagemsg("Found headword template: %s" % unicode(headword_template))
        pagemsg("Found decl template: %s" % unicode(noun_table_template))
        if noun_old_template:
            pagemsg("Found old decl template: %s" % unicode(noun_old_template))

    # Retrieve headword translit and maybe transfer to decl
    headword_tr = getparam(headword_template, "tr")
    if headword_tr:
        if verbose:
            pagemsg("Found headword manual translit tr=%s" % headword_tr)
        if "," in headword_tr:
            pagemsg(
                "WARNING: Comma in headword manual translit, skipping: %s" %
                headword_tr)
            return None
        # Punt if multi-arg-set, can't handle yet
        for decl_template in decl_templates:
            for param in decl_template.params:
                if not param.showkey:
                    val = unicode(param.value)
                    if val == "or":
                        pagemsg(
                            "WARNING: Manual translit and multi-decl templates, can't handle, skipping: %s"
                            % unicode(decl_template))
                        return None
                    if val == "-" or val == "_" or val.startswith("join:"):
                        pagemsg(
                            "WARNING: Manual translit and multi-word templates, can't handle, skipping: %s"
                            % unicode(decl_template))
                        return None
            for i in xrange(2, 10):
                if getparam(headword_template, "tr%s" % i):
                    pagemsg(
                        "WARNING: Headword template has translit param tr%s, can't handle, skipping: %s"
                        % (i, unicode(headword_template)))
                    return None
            if runounlib.arg1_is_stress(getparam(decl_template, "1")):
                lemma_arg = "2"
            else:
                lemma_arg = "1"
            lemmaval = getparam(decl_template, lemma_arg)
            if not lemmaval:
                lemmaval = subpagetitle
            if "//" in lemmaval:
                m = re.search("^(.*?)//(.*)$", lemmaval)
                if m.group(2) != headword_tr:
                    pagemsg(
                        "WARNING: Found existing manual translit in decl template %s, but doesn't match headword translit %s; skipping"
                        % (lemmaval, headword_tr))
                    return None
                else:
                    pagemsg(
                        "Already found manual translit in decl template %s" %
                        lemmaval)
            else:
                lemmaval += "//" + headword_tr
                orig_decl_template = unicode(decl_template)
                decl_template.add(lemma_arg, lemmaval)
                pagemsg("Replacing decl %s with %s" %
                        (orig_decl_template, unicode(decl_template)))
                frobbed_manual_translit = [headword_tr]

    genders = blib.fetch_param_chain(headword_template, "2", "g")

    bian_replaced = 0

    # Change a=bi in decl to a=ia or a=ai, depending on order of anim/inan in
    # headword template
    for decl_template in decl_templates:
        if getparam(decl_template, "a") in ["b", "bi", "bian", "both"]:
            saw_in = -1
            saw_an = -1
            for i, g in enumerate(genders):
                if re.search(r"\bin\b", g) and saw_in < 0:
                    saw_in = i
                if re.search(r"\ban\b", g) and saw_an < 0:
                    saw_an = i
            if saw_in >= 0 and saw_an >= 0:
                orig_decl_template = unicode(decl_template)
                if saw_in < saw_an:
                    pagemsg("Replacing a=bi with a=ia in decl template")
                    decl_template.add("a", "ia")
                    bian_replaced = 1
                else:
                    pagemsg("Replacing a=bi with a=ai in decl template")
                    decl_template.add("a", "ai")
                    bian_replaced = 1
                pagemsg("Replacing decl %s with %s" %
                        (orig_decl_template, unicode(decl_template)))

    generate_template = re.sub(
        r"^\{\{ru-noun-old", "{{ru-generate-noun-args|old=1",
        re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
               unicode(noun_table_template)))
    generate_result = expand_text(generate_template)
    if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
    args = blib.split_generate_args(generate_result)

    genders = runounlib.check_old_noun_headword_forms(headword_template, args,
                                                      subpagetitle, pagemsg)
    if genders == None:
        return None

    new_params = []
    for param in noun_table_template.params:
        new_params.append((param.name, param.value))

    orig_headword_template = unicode(headword_template)
    params_to_preserve = runounlib.fix_old_headword_params(
        headword_template, new_params, genders, pagemsg)
    if params_to_preserve == None:
        return None

    if unicode(headword_template.name) == "ru-proper noun":
        # If proper noun and n is both then we need to add n=both because
        # proper noun+ defaults to n=sg
        if args["n"] == "b" and not getparam(headword_template, "n"):
            pagemsg("Adding n=both to headword tempate")
            headword_template.add("n", "both")
        # Correspondingly, if n is sg then we can usually remove n=sg;
        # but we need to check that the number is actually sg with n=sg
        # removed because of the possibility of plurale tantum lemmas
        if args["n"] == "s":
            generate_template_with_ndef = generate_template.replace(
                "}}", "|ndef=sg}}")
            generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
                                                 generate_template_with_ndef)
            generate_result = expand_text(generate_template_with_ndef)
            if not generate_result:
                pagemsg("WARNING: Error generating noun args, skipping")
                return None
            ndef_args = blib.split_generate_args(generate_result)
            if ndef_args["n"] == "s":
                existing_n = getparam(headword_template, "n")
                if existing_n and not re.search(r"^s", existing_n):
                    pagemsg(
                        "WARNING: Something wrong: Found n=%s, not singular" %
                        existing_n)
                else:
                    pagemsg("Removing n=sg from headword tempate")
                    rmparam(headword_template, "n")
            else:
                pagemsg(
                    "WARNING: Unable to remove n= from headword template because n=%s"
                    % ndef_args["n"])

    headword_template.params.extend(params_to_preserve)
    ru_noun_changed = 0
    ru_proper_noun_changed = 0
    if unicode(headword_template.name) == "ru-noun":
        headword_template.name = "ru-noun+"
        ru_noun_changed = 1
    else:
        headword_template.name = "ru-proper noun+"
        ru_proper_noun_changed = 1
    if unicode(noun_table_template).startswith("{{ru-noun-old"):
        headword_template.add("old", "1")

    pagemsg("Replacing headword %s with %s" %
            (orig_headword_template, unicode(headword_template)))

    return unicode(
        parsed
    ), ru_noun_changed, ru_proper_noun_changed, bian_replaced, frobbed_manual_translit