Beispiel #1
0
def do_canon_param(pagetitle,
                   index,
                   template,
                   fromparam,
                   toparam,
                   paramtr,
                   arabic,
                   latin,
                   include_tempname_in_changelog=False):
    """Work out the canonical Arabic and Latin for one template parameter.

    The template is only read here (its name and its text form appear in
    log messages); every decision is reported through ``msg``.

    Args:
        pagetitle: page title, used only as part of the log prefix.
        index: page index, used only as part of the log prefix.
        template: template being examined.
        fromparam: name of the parameter holding the Arabic text.
        toparam: name of the parameter the Arabic is reported as going to;
            it only changes the wording of the changelog entry.
        paramtr: name of the transliteration parameter.
        arabic: current Arabic text.
        latin: current manual transliteration. "-" means no action is
            taken; a false value means there is no transliteration.
        include_tempname_in_changelog: when true, changelog entries name the
            transliteration parameter as "<template name>.<paramtr>".

    Returns:
        A tuple ``(canonarabic, canonlatin, actions)``:

        * canonarabic -- False when ``latin`` is "-" or the canonical Arabic
          equals ``arabic``; otherwise the canonical Arabic.
        * canonlatin -- False when ``latin`` is "-" or the canonical Latin
          equals ``latin``; True when the auto-transliteration of the
          canonical Arabic equals the canonical Latin (the manual
          transliteration is redundant); "" when ``latin`` is false;
          otherwise the canonical Latin.
        * actions -- list of changelog strings describing the changes.
    """
    template_name = unicode(template.name)

    def pagemsg(text):
        msg("Page %s %s: %s.%s: %s" %
            (index, pagetitle, template_name, fromparam, text))

    if show_template:
        pagemsg("Processing %s" % (unicode(template)))

    paramtrname = ("%s.%s" % (template_name, paramtr)
                   if include_tempname_in_changelog else paramtr)

    if latin == "-":
        pagemsg("Latin is -, taking no action")
        return False, False, []

    changelog = []

    # Derive the canonical forms. "vocalized" records that tr_matching
    # succeeded, which later decides how the operation is labelled.
    vocalized = False
    canon_latin = ""
    if not latin:
        _, canon_arabic = ar_translit.canonicalize_latin_arabic(
            None, arabic, msgfun=pagemsg)
    else:
        try:
            canon_arabic, canon_latin = ar_translit.tr_matching(
                arabic, latin, True, msgfun=pagemsg)
        except Exception as err:
            # Matching failed; fall back to canonicalizing each side.
            pagemsg("NOTE: Unable to vocalize %s (%s): %s: %s" %
                    (arabic, latin, err, unicode(template)))
            canon_latin, canon_arabic = (
                ar_translit.canonicalize_latin_arabic(
                    latin, arabic, msgfun=pagemsg))
        else:
            vocalized = True

    # Values used only in log messages: "same" stands for "unchanged".
    latin_unchanged = canon_latin == latin
    arabic_unchanged = canon_arabic == arabic
    latin_display = "same" if latin_unchanged else canon_latin
    arabic_display = "same" if arabic_unchanged else canon_arabic
    if latin or canon_latin:
        latin_note = " (%s -> %s)" % (latin, latin_display)
    else:
        latin_note = ""

    # The "unable" note stays inside the try so that a failure while
    # logging is handled exactly like a failure while transliterating.
    try:
        auto_translit = ar_translit.tr(canon_arabic, msgfun=pagemsg)
        if not auto_translit:
            pagemsg("NOTE: Unable to auto-translit %s (canoned from %s): %s" %
                    (canon_arabic, arabic, unicode(template)))
    except Exception as err:
        pagemsg("NOTE: Unable to transliterate %s (canoned from %s): %s: %s" %
                (canon_arabic, arabic, err, unicode(template)))
        auto_translit = None

    if arabic_unchanged:
        pagemsg("No change in Arabic %s%s" % (arabic, latin_note))
        arabic_result = False
    else:
        arabic_result = canon_arabic
        if vocalized:
            operation, actionop, with_diff = "Vocalizing", "vocalize", False
        elif latin:
            operation, actionop, with_diff = (
                "Cross-canoning", "cross-canon", True)
        else:
            operation, actionop, with_diff = (
                "Self-canoning", "self-canon", True)
        diff_note = ((" (%s)" % diff_string(arabic, canon_arabic))
                     if with_diff else "")
        pagemsg("%s Arabic %s -> %s%s%s: %s" %
                (operation, arabic, canon_arabic, latin_note, diff_note,
                 unicode(template)))
        if fromparam == toparam:
            changelog.append("%s %s=%s -> %s" %
                             (actionop, fromparam, arabic, canon_arabic))
        else:
            changelog.append("%s %s=%s -> %s=%s" %
                             (actionop, fromparam, arabic, toparam,
                              canon_arabic))

        # A difference that survives diacritic removal means more than
        # vowel marks changed; list what looks odd in the old text.
        bare_canon = ar_translit.remove_diacritics(canon_arabic)
        bare_old = ar_translit.remove_diacritics(arabic)
        if bare_old != bare_canon:
            checks = [
                ("stray space",
                 "  " in bare_old or bare_old.startswith(" ")
                 or bare_old.endswith(" ")),
                ("Latin", re.search("[A-Za-z]", nfd_form(bare_old))),
                ("NBSP", u"\u00A0" in bare_old),
                ("L2R/R2L", re.search(u"[\u200E\u200F]", bare_old)),
                ("Farsi Yeh", u"ی" in bare_old),
                ("Keheh", u"ک" in bare_old),
                ("Arabic Pres-A", re.search(u"[\uFB50-\uFDCF]", bare_old)),
                ("Arabic word ligatures",
                 re.search(u"[\uFDF0-\uFDFF]", bare_old)),
                ("Arabic Pres-B", re.search(u"[\uFE70-\uFEFF]", bare_old)),
            ]
            oddities = [label for label, hit in checks if hit]
            bare_diff = diff_string(bare_old, bare_canon)
            oddity_note = ((" (in old: %s)" % ", ".join(oddities))
                           if oddities else "")
            pagemsg(
                "NOTE: Without diacritics, old Arabic %s different from canon %s%s (%s): %s"
                % (arabic, canon_arabic, oddity_note, bare_diff,
                   unicode(template)))

    if not latin:
        # No manual transliteration: canon_latin is still its initial "".
        return (arabic_result, canon_latin, changelog)

    # Only an exact match with the auto-transliteration counts as redundant.
    if auto_translit and auto_translit == canon_latin:
        pagemsg("Removing redundant translit for %s -> %s%s" %
                (arabic, arabic_display, latin_note))
        changelog.append("remove redundant %s=%s" % (paramtrname, latin))
        return (arabic_result, True, changelog)

    if vocalized:
        operation, passive, actionop = (
            "Match-canoning", "Match-canoned", "match-canon")
    else:
        operation, passive, actionop = (
            "Cross-canoning", "Cross-canoned", "cross-canon")
    if auto_translit:
        pagemsg(
            "NOTE: %s Latin %s not same as auto-translit %s, can't remove: %s"
            % (passive, canon_latin, auto_translit, unicode(template)))
    if latin_unchanged:
        pagemsg(
            "No change in Latin %s: Arabic %s -> %s (auto-translit %s)" %
            (latin, arabic, arabic_display, auto_translit))
        return (arabic_result, False, changelog)

    pagemsg(
        "%s Latin %s -> %s: Arabic %s -> %s (auto-translit %s): %s" %
        (operation, latin, canon_latin, arabic, arabic_display,
         auto_translit, unicode(template)))
    changelog.append("%s %s=%s -> %s" %
                     (actionop, paramtrname, latin, canon_latin))
    return (arabic_result, canon_latin, changelog)
Beispiel #2
0
def process_param(pagetitle, index, template, param, paramtr,
    include_tempname_in_changelog=False):
  """Canonicalize or drop the manual transliteration of one parameter.

  Reads the Arabic text from `param` and the transliteration from `paramtr`
  on `template`. When the transliteration is redundant it is removed from
  the template; when it can be canonicalized it is overwritten.

  Returns:
    False when the Arabic parameter is empty; a one-element list holding a
    changelog string when the template was modified; True in every other
    case (translit is "-", is missing, or is already canonical).
  """
  def pagemsg(text):
    msg("Page %s %s: %s.%s: %s" % (index, pagetitle, template.name, param,
      text))

  def snapshot():
    # Text form of the template, taken before it is modified
    return "%s" % unicode(template)

  def report_replacement(before):
    msg("Page %s %s: Replaced %s with %s" %
        (index, pagetitle, before, unicode(template)))

  arabic = getparam(template, param)
  latin = getparam(template, paramtr)
  paramtrname = ("%s.%s" % (template.name, paramtr)
      if include_tempname_in_changelog else paramtr)

  if not arabic:
    return False
  if latin == "-":
    pagemsg("Translit is '-', skipping")
    return True
  if not latin:
    return True

  # Canonicalize the Latin against the Arabic; None/empty means it failed
  try:
    _, matched = tr_matching(arabic, latin, True, pagemsg)
    if not matched:
      pagemsg("Unable to match-canonicalize %s (%s)" % (arabic, latin))
  except Exception as err:
    pagemsg("Trying to match-canonicalize %s (%s): %s" % (arabic, latin, err))
    matched = None

  # Automatic transliteration of the Arabic as it stands
  try:
    auto = ar_translit.tr(arabic)
    if not auto:
      pagemsg("Unable to auto-translit %s" % arabic)
  except Exception as err:
    pagemsg("Trying to transliterate %s: %s" % (arabic, err))
    auto = None

  if auto and matched:
    # Only an exact match makes the manual transliteration redundant
    if auto == matched:
      pagemsg("Removing redundant translit for %s (%s)" % (arabic, latin))
      before = snapshot()
      template.remove(paramtr)
      report_replacement(before)
      return ["remove redundant %s=%s" % (paramtrname, latin)]
    pagemsg("Auto-translit for %s (%s) not same as manual translit %s (canonicalized %s)" %
        (arabic, auto, latin, matched))

  if matched:
    if latin != matched:
      pagemsg("Match-canonicalizing Latin %s to %s" % (latin, matched))
      before = snapshot()
      addparam(template, paramtr, matched)
      report_replacement(before)
      return ["match-canon %s=%s -> %s" % (paramtrname, latin, matched)]
    return True

  # Matching failed: canonicalize the Latin on its own
  selfcanon, _ = ar_translit.canonicalize_latin_arabic(latin, None)
  if latin != selfcanon:
    pagemsg("Self-canonicalizing Latin %s to %s" % (latin, selfcanon))
    before = snapshot()
    addparam(template, paramtr, selfcanon)
    report_replacement(before)
    return ["self-canon %s=%s -> %s" % (paramtrname, latin, selfcanon)]
  return True
def process_param(pagetitle, index, template, param, paramtr,
    include_tempname_in_changelog=False):
  """Clean up the transliteration parameter `paramtr` of `template`.

  The Arabic text is taken from `param`. Depending on how the manual
  transliteration compares with the match-canonicalized form and with the
  automatic transliteration, `paramtr` is removed, rewritten, or left alone.

  Returns:
    False if `param` is empty; a list with a single changelog string if the
    template was changed; otherwise True.
  """
  def pagemsg(text):
    msg("Page %s %s: %s.%s: %s" % (index, pagetitle, template.name, param,
      text))

  arabic = getparam(template, param)
  latin = getparam(template, paramtr)

  paramtrname = paramtr
  if include_tempname_in_changelog:
    paramtrname = "%s.%s" % (template.name, paramtr)

  if not arabic:
    return False
  if latin == "-":
    pagemsg("Translit is '-', skipping")
    return True
  if not latin:
    # Nothing to canonicalize or remove
    return True

  try:
    _, canon = tr_matching(arabic, latin, True, pagemsg)
    if not canon:
      pagemsg("Unable to match-canonicalize %s (%s)" % (arabic, latin))
  except Exception as exc:
    pagemsg("Trying to match-canonicalize %s (%s): %s" % (arabic, latin, exc))
    canon = None

  try:
    autotr = ar_translit.tr(arabic)
    if not autotr:
      pagemsg("Unable to auto-translit %s" % arabic)
  except Exception as exc:
    pagemsg("Trying to transliterate %s: %s" % (arabic, exc))
    autotr = None

  if autotr and canon:
    if autotr == canon:
      # Manual translit adds nothing over the automatic one: drop it
      pagemsg("Removing redundant translit for %s (%s)" % (arabic, latin))
      previous = "%s" % unicode(template)
      template.remove(paramtr)
      msg("Page %s %s: Replaced %s with %s" %
          (index, pagetitle, previous, unicode(template)))
      return ["remove redundant %s=%s" % (paramtrname, latin)]
    else:
      pagemsg("Auto-translit for %s (%s) not same as manual translit %s (canonicalized %s)" %
          (arabic, autotr, latin, canon))

  # Choose the replacement value and the wording: the match-canonicalized
  # form when available, otherwise the Latin canonicalized on its own.
  if canon:
    replacement = canon
    logfmt = "Match-canonicalizing Latin %s to %s"
    actionfmt = "match-canon %s=%s -> %s"
  else:
    replacement, _ = ar_translit.canonicalize_latin_arabic(latin, None)
    logfmt = "Self-canonicalizing Latin %s to %s"
    actionfmt = "self-canon %s=%s -> %s"

  if latin != replacement:
    pagemsg(logfmt % (latin, replacement))
    previous = "%s" % unicode(template)
    addparam(template, paramtr, replacement)
    msg("Page %s %s: Replaced %s with %s" %
        (index, pagetitle, previous, unicode(template)))
    return [actionfmt % (paramtrname, latin, replacement)]
  return True
Beispiel #4
0
def do_canon_param(pagetitle, index, template, fromparam, toparam, paramtr,
    arabic, latin, include_tempname_in_changelog=False):
  """Compute canonical Arabic/Latin for a parameter and describe the changes.

  `template` is only read (name and text form, for log messages). `latin`
  equal to "-" short-circuits with no action; a false `latin` means only the
  Arabic is canonicalized.

  Returns a tuple (canonarabic, canonlatin, actions):
    canonarabic: False if `latin` is "-" or the Arabic is unchanged, else
      the canonical Arabic.
    canonlatin: False if `latin` is "-" or the Latin is unchanged; True if
      the auto-transliteration of the canonical Arabic equals the canonical
      Latin, i.e. the manual transliteration is redundant; "" if `latin` is
      false; else the canonical Latin.
    actions: list of changelog strings.
  """
  tname = unicode(template.name)

  def pagemsg(text):
    msg("Page %s %s: %s.%s: %s" % (index, pagetitle, tname, fromparam, text))

  if show_template:
    pagemsg("Processing %s" % (unicode(template)))

  paramtrname = paramtr
  if include_tempname_in_changelog:
    paramtrname = "%s.%s" % (tname, paramtr)

  if latin == "-":
    pagemsg("Latin is -, taking no action")
    return False, False, []

  changes = []

  # Canonical forms; `matched` is set only when tr_matching succeeded
  matched = False
  latin_canon = ""
  if latin:
    try:
      arabic_canon, latin_canon = ar_translit.tr_matching(arabic, latin, True,
          msgfun=pagemsg)
      matched = True
    except Exception as exc:
      pagemsg("NOTE: Unable to vocalize %s (%s): %s: %s" % (arabic, latin, exc, unicode(template)))
      latin_canon, arabic_canon = ar_translit.canonicalize_latin_arabic(latin,
          arabic, msgfun=pagemsg)
  else:
    _, arabic_canon = ar_translit.canonicalize_latin_arabic(None, arabic,
        msgfun=pagemsg)

  # Log-only renderings, where "same" marks an unchanged value
  shown_latin = latin_canon
  if latin_canon == latin:
    shown_latin = "same"
  shown_arabic = arabic_canon
  if arabic_canon == arabic:
    shown_arabic = "same"
  trnote = ""
  if latin or latin_canon:
    trnote = " (%s -> %s)" % (latin, shown_latin)

  try:
    autotr = ar_translit.tr(arabic_canon, msgfun=pagemsg)
    if not autotr:
      pagemsg("NOTE: Unable to auto-translit %s (canoned from %s): %s" %
          (arabic_canon, arabic, unicode(template)))
  except Exception as exc:
    pagemsg("NOTE: Unable to transliterate %s (canoned from %s): %s: %s" %
        (arabic_canon, arabic, exc, unicode(template)))
    autotr = None

  arabic_out = arabic_canon
  if arabic_canon == arabic:
    pagemsg("No change in Arabic %s%s" % (arabic, trnote))
    arabic_out = False
  else:
    operation, actionop = (
        ("Vocalizing", "vocalize") if matched else
        ("Cross-canoning", "cross-canon") if latin else
        ("Self-canoning", "self-canon"))
    # A diff is shown for every operation except plain vocalization
    diffmsg = ""
    if not matched:
      diffmsg = " (%s)" % diff_string(arabic, arabic_canon)
    pagemsg("%s Arabic %s -> %s%s%s: %s" % (operation, arabic, arabic_canon,
      trnote, diffmsg, unicode(template)))
    if fromparam == toparam:
      changes.append("%s %s=%s -> %s" % (actionop, fromparam, arabic,
        arabic_canon))
    else:
      changes.append("%s %s=%s -> %s=%s" % (actionop, fromparam, arabic,
        toparam, arabic_canon))

    canon_bare = ar_translit.remove_diacritics(arabic_canon)
    old_bare = ar_translit.remove_diacritics(arabic)
    if old_bare != canon_bare:
      # Probes run in order against the old text stripped of diacritics
      probes = [
        ("stray space",
          lambda s: "  " in s or s.startswith(" ") or s.endswith(" ")),
        ("Latin", lambda s: re.search("[A-Za-z]", nfd_form(s))),
        ("NBSP", lambda s: u"\u00A0" in s),
        ("L2R/R2L", lambda s: re.search(u"[\u200E\u200F]", s)),
        ("Farsi Yeh", lambda s: u"ی" in s),
        ("Keheh", lambda s: u"ک" in s),
        ("Arabic Pres-A", lambda s: re.search(u"[\uFB50-\uFDCF]", s)),
        ("Arabic word ligatures", lambda s: re.search(u"[\uFDF0-\uFDFF]", s)),
        ("Arabic Pres-B", lambda s: re.search(u"[\uFE70-\uFEFF]", s)),
      ]
      found = [label for label, probe in probes if probe(old_bare)]
      bare_diff = diff_string(old_bare, canon_bare)
      found_note = ""
      if found:
        found_note = " (in old: %s)" % ", ".join(found)

      pagemsg("NOTE: Without diacritics, old Arabic %s different from canon %s%s (%s): %s"
          % (arabic, arabic_canon, found_note, bare_diff, unicode(template)))

  latin_out = latin_canon
  if latin:
    if autotr and autotr == latin_canon:
      # Exact match with the auto-transliteration: manual one is redundant
      pagemsg("Removing redundant translit for %s -> %s%s" % (
          arabic, shown_arabic, trnote))
      changes.append("remove redundant %s=%s" % (paramtrname, latin))
      latin_out = True
    else:
      operation, passive, actionop = (
          ("Match-canoning", "Match-canoned", "match-canon") if matched else
          ("Cross-canoning", "Cross-canoned", "cross-canon"))
      if autotr:
        pagemsg("NOTE: %s Latin %s not same as auto-translit %s, can't remove: %s" %
            (passive, latin_canon, autotr, unicode(template)))
      if latin_canon == latin:
        pagemsg("No change in Latin %s: Arabic %s -> %s (auto-translit %s)" %
            (latin, arabic, shown_arabic, autotr))
        latin_out = False
      else:
        pagemsg("%s Latin %s -> %s: Arabic %s -> %s (auto-translit %s): %s" % (
            operation, latin, latin_canon, arabic, shown_arabic, autotr,
            unicode(template)))
        changes.append("%s %s=%s -> %s" % (actionop, paramtrname, latin,
          latin_canon))

  return (arabic_out, latin_out, changes)