def _localized_template(self, template, lang="English"):
    """
    Pick the localized variant of a template when one is known.

    :param str template: name of the template; its canonical form is
        asserted to be present in ``self._alltemplates``
    :param str lang: language name passed to ``format_title``
    :returns: the title built by ``format_title(template, lang)`` if its
        canonical form is in ``self._alltemplates``, otherwise ``template``
    """
    assert canonicalize(template) in self._alltemplates

    candidate = format_title(template, lang)
    # fall back to English when the localized template is not known
    if canonicalize(candidate) not in self._alltemplates:
        return template
    return candidate
def _localized_template(self, template, lang="English"):
    """
    Pick the localized variant of a template when one is known.

    The localized name is ``"<template> (<lang>)"``; for ``"English"`` the
    name is used unchanged.

    :param str template: name of the template; its canonical form is
        asserted to be present in ``self._templates_list``
    :param str lang: language name used for the suffix
    :returns: the localized name if its canonical form is in
        ``self._templates_list``, otherwise ``template``
    """
    assert canonicalize(template) in self._templates_list

    if lang == "English":
        candidate = template
    else:
        candidate = "{} ({})".format(template, lang)

    # fall back to English when the localized template is not known
    if canonicalize(candidate) not in self._templates_list:
        return template
    return candidate
def update_page(self, title, text):
    """
    Update package templates on given page.

    The wikitext is parsed and every ``Aur``/``AUR``/``Grp``/``Pkg`` template
    is passed to :py:meth:`update_package_template`. When that call returns a
    hint, the package link is considered broken:

    - a warning is logged
    - the problem is recorded with :py:meth:`add_report_line`
    - the template is followed by a (localized) {{Broken package link}} flag
      carrying the hint; an already present flag is replaced

    When no hint is returned, a flag adjacent to the template is removed.

    :param title: title of the wiki page
    :param text: content of the wiki page
    :returns: a :py:class:`mwparserfromhell.wikicode.Wikicode` object with
              the updated content of the page
    """
    logger.info("Parsing page [[{}]]...".format(title))
    pagelang = detect_language(title)[1]
    wikicode = mwparserfromhell.parse(text)

    def _is_broken_flag(node):
        # True for a template whose canonical name starts with the flag name
        if not isinstance(node, mwparserfromhell.nodes.Template):
            return False
        return canonicalize(node.name).startswith("Broken package link")

    package_templates = ["Aur", "AUR", "Grp", "Pkg"]

    for template in wikicode.ifilter_templates():
        # skip unrelated templates
        if not any(template.name.matches(name) for name in package_templates):
            continue

        # skip templates no longer under wikicode (templates nested under previously
        # removed parent template are still detected by ifilter)
        try:
            wikicode.index(template, True)
        except ValueError:
            continue

        # strip whitespace around the parameter, otherwise it is added to
        # the link and rendered incorrectly
        self.strip_whitespace(wikicode, template)

        hint = self.update_package_template(template, pagelang)

        # add/remove/update {{Broken package link}} flag
        parent = get_parent_wikicode(wikicode, template)
        adjacent = get_adjacent_node(parent, template, ignore_whitespace=True)

        if hint is None:
            if _is_broken_flag(adjacent):
                # package has been found again, remove existing flag
                wikicode.remove(adjacent)
            continue

        logger.warning("broken package link: {}: {}".format(template, hint))
        self.add_report_line(title, template, hint)
        flag_name = self._localized_template("Broken package link", pagelang)
        broken_flag = "{{%s|%s}}" % (flag_name, hint)
        if _is_broken_flag(adjacent):
            # replace since the hint might be different
            wikicode.replace(adjacent, broken_flag)
        else:
            wikicode.insert_after(template, broken_flag)

    return wikicode
def check_trivial(self, wikilink):
    """
    Drop a link text that merely repeats the link title.

    A link such as `[[Foo|foo]]` is rewritten to `[[foo]]`: the text is moved
    into the title and the text is cleared. Links without text, or whose
    title and text differ after canonicalization, are left untouched.

    :param wikilink: instance of `mwparserfromhell.nodes.wikilink.Wikilink`
                     representing the link to be checked
    """
    text = wikilink.text
    if text is None:
        return

    # Wikicode.matches() ignores even the '#' character indicating relative links;
    # hence [[#foo|foo]] would be replaced with [[foo]]
    # Our canonicalize() function does exactly what we want and need.
    if canonicalize(wikilink.title) != canonicalize(text):
        return

    # title is mandatory, so the text becomes the title
    wikilink.title = text
    wikilink.text = None
def update_page(self, title, text):
    """
    Update package templates on given page.

    The wikitext is parsed and every ``Aur``/``AUR``/``Grp``/``Pkg`` template
    is passed to :py:meth:`update_package_template`. When that call returns a
    hint, the package link is considered broken:

    - a warning is logged
    - the problem is recorded with :py:meth:`add_report_line`
    - the template is followed by a (localized) {{Broken package link}} flag
      carrying the hint; an already present flag is replaced

    When no hint is returned, a flag adjacent to the template is removed.

    :param title: title of the wiki page
    :param text: content of the wiki page
    :returns: a :py:class:`mwparserfromhell.wikicode.Wikicode` object with
              the updated content of the page
    """
    logger.info("Parsing '%s'..." % title)
    pagelang = detect_language(title)[1]
    wikicode = mwparserfromhell.parse(text)

    def _is_broken_flag(node):
        # True for a template whose canonical name starts with the flag name
        if not isinstance(node, mwparserfromhell.nodes.Template):
            return False
        return canonicalize(node.name).startswith("Broken package link")

    package_templates = ["Aur", "AUR", "Grp", "Pkg"]

    for template in wikicode.ifilter_templates():
        # skip unrelated templates
        if not any(template.name.matches(name) for name in package_templates):
            continue

        # skip templates no longer under wikicode (templates nested under previously
        # removed parent template are still detected by ifilter)
        try:
            wikicode.index(template, True)
        except ValueError:
            continue

        hint = self.update_package_template(template, pagelang)

        # add/remove/update {{Broken package link}} flag
        parent = get_parent_wikicode(wikicode, template)
        adjacent = get_adjacent_node(parent, template, ignore_whitespace=True)

        if hint is None:
            if _is_broken_flag(adjacent):
                # package has been found again, remove existing flag
                wikicode.remove(adjacent)
            continue

        logger.warning("broken package link: {}: {}".format(template, hint))
        self.add_report_line(title, template, hint)
        flag_name = self._localized_template("Broken package link", pagelang)
        broken_flag = "{{%s|%s}}" % (flag_name, hint)
        if _is_broken_flag(adjacent):
            # replace since the hint might be different
            wikicode.replace(adjacent, broken_flag)
        else:
            wikicode.insert_after(template, broken_flag)

    return wikicode
def check_relative(self, src_title, wikilink, title):
    """
    Turn section links pointing at the current page into relative links.

    A link such as `[[Foo#Bar]]` found on the page `src_title` becomes
    `[[#Bar]]` when `Foo` is, or redirects to, `src_title`. Links with an
    interwiki prefix or without a section name are not touched.

    :param str src_title: the title of the page being checked
    :param wikilink: the link to be checked
    :type wikilink: :py:class:`mwparserfromhell.nodes.wikilink.Wikilink`
    :param title: the parsed :py:attr:`wikilink.title`; it is re-parsed from
        the new link title when the link is changed
    :type title: :py:class:`mw.parser_helpers.title.Title`
    """
    if title.iwprefix:
        return
    if not title.sectionname:
        return

    # check if title is a redirect
    redirect_target = self.api.redirects.map.get(title.fullpagename)
    if not redirect_target:
        effective = title
    else:
        effective = self.api.Title(redirect_target)
        effective.sectionname = title.sectionname

    if canonicalize(src_title) != effective.fullpagename:
        return

    wikilink.title = "#" + effective.sectionname
    title.parse(wikilink.title)
def localize_flag(wikicode, node, template_name):
    """
    Rename the flag template following ``node`` to ``template_name``.

    The node adjacent to ``node`` (whitespace ignored) is renamed only when
    it is a template whose name, with the language part removed by
    ``lang.detect_language``, canonically equals the base name of
    ``template_name``.

    :param wikicode: a :py:class:`mwparserfromhell.wikicode.Wikicode` object
    :param node: a :py:class:`mwparserfromhell.nodes.Node` object
    :param str template_name: the name of the template flag, potentially
                              including a language name
    """
    parent = get_parent_wikicode(wikicode, node)
    adjacent = get_adjacent_node(parent, node, ignore_whitespace=True)
    if not isinstance(adjacent, mwparserfromhell.nodes.Template):
        return

    adjacent_base = lang.detect_language(str(adjacent.name))[0]
    wanted_base = lang.detect_language(template_name)[0]
    if canonicalize(adjacent_base) != canonicalize(wanted_base):
        return

    adjacent.name = template_name
def _title_from_langlink(self, langlink):
    """
    Build the page title referenced by a langlink entry.

    :param langlink: mapping with the keys ``"lang"`` (language tag) and
        ``"*"`` (title of the linked page)
    :returns: the title formatted with ``lang.format_title``; for internal
        tags it is also canonicalized and, if ``self.api.redirects.resolve``
        returns a target, replaced by that target without its section part
    """
    tag = langlink["lang"]
    langname = lang.langname_for_tag(tag)
    title = lang.format_title(langlink["*"], langname)

    # NOTE(review): redirect resolution is assumed to apply to internal
    # tags only -- confirm against the original layout of this method
    if not lang.is_internal_tag(tag):
        return title

    title = canonicalize(title)
    # resolve redirects
    resolved = self.api.redirects.resolve(title)
    if resolved is None:
        return title
    return resolved.split("#", maxsplit=1)[0]
def gen_nodes():
    """
    Yield ``(checker, node)`` pairs for all registered checkers.

    For every node type in ``self.checkers``, the nodes of that type are
    taken recursively from ``wikicode`` and each is paired with every
    checker registered for the type. Templates whose canonical name starts
    with one of ``self.skip_templates`` are left out.
    """
    def _is_skipped(node_type, node):
        # skip templates that may be added or removed
        if node_type is not mwparserfromhell.nodes.Template:
            return False
        return any(canonicalize(node.name).startswith(prefix)
                   for prefix in self.skip_templates)

    for node_type, checkers in self.checkers.items():
        for node in wikicode.ifilter(recursive=True, forcetype=node_type):
            if _is_skipped(node_type, node):
                continue
            # handle the node with all registered checkers
            for checker in checkers:
                yield checker, node
def _title_from_langlink(self, langlink):
    """
    Build the page title referenced by a langlink entry.

    :param langlink: mapping with the keys ``"lang"`` (language tag) and
        ``"*"`` (title of the linked page)
    :returns: the linked title, suffixed with ``" (<language name>)"`` unless
        the language is English; for internal tags it is also canonicalized
        and, if present in ``self.redirects``, replaced by the redirect
        target without its section part
    """
    tag = langlink["lang"]
    langname = lang.langname_for_tag(tag)

    title = langlink["*"]
    if langname != "English":
        title = "{} ({})".format(title, langname)

    # NOTE(review): redirect resolution is assumed to apply to internal
    # tags only -- confirm against the original layout of this method
    if not lang.is_internal_tag(tag):
        return title

    title = canonicalize(title)
    # resolve redirects
    if title in self.redirects:
        title = self.redirects[title].split("#", maxsplit=1)[0]
    return title
def __init__(self, api, cliargs):
    """
    Store the API handle and normalize the command line arguments.

    The following normalization is done on ``cliargs`` in place:

    - when both ``save`` and ``print`` are ``False``, ``print`` is enabled
    - ``toc_languages == ["all"]`` is expanded to ``lang.get_internal_tags()``
    - ``toc_page`` is canonicalized and its language part, as detected by
      ``lang.detect_language``, is stripped

    Afterwards ``self.titles`` holds one page title per language tag in
    ``toc_languages`` (sorted by tag): the bare ``toc_page`` for the local
    language and ``"<toc_page> (<language name>)"`` for the others.

    :param api: API object, stored as ``self.api``
    :param cliargs: parsed command line arguments, stored as ``self.cliargs``
    """
    self.api = api
    self.cliargs = cliargs

    if self.cliargs.save is False and self.cliargs.print is False:
        self.cliargs.print = True

    if len(self.cliargs.toc_languages) == 1 and self.cliargs.toc_languages[0] == "all":
        self.cliargs.toc_languages = lang.get_internal_tags()

    # strip "(Language)" suffix
    self.cliargs.toc_page = lang.detect_language(canonicalize(self.cliargs.toc_page))[0]

    # the tag of the local language does not depend on the loop variable,
    # so it is looked up only once
    local_tag = lang.tag_for_langname(lang.get_local_language())

    # detect page titles
    self.titles = [
        self.cliargs.toc_page
        if ln == local_tag
        else "{} ({})".format(self.cliargs.toc_page, lang.langname_for_tag(ln))
        for ln in sorted(self.cliargs.toc_languages)
    ]
def get_header_parts(wikicode, magics=None, cats=None, langlinks=None, remove_from_parent=False):
    """
    According to Help:Style, the layout of the page should be as follows:

    1. Magic words (optional)
       (includes only {{DISPLAYTITLE:...}} and {{Lowercase title}})
    2. Categories
    3. Interlanguage links (if any)
    4. Article status templates (optional)
    5. Related articles box (optional)
    6. Preface or introduction
    7. Table of contents (automatic)
    8. Article-specific sections

    Only 1-3 are safe to be updated automatically. This function will extract
    the header parts from the wikicode and return them as tuple
    ``(parent, magics, cats, langlinks)``, where ``parent`` is an instance of
    :py:class:`mwparserfromhell.wikicode.Wikicode` containing all extracted
    elements. It is assumed that all header elements are children of the same
    parent node, otherwise :py:exc:`HeaderError` is raised.

    If ``remove_from_parent`` is ``True``, the extracted header elements are
    also removed from the parent node and :py:func:`build_header` should be
    called to insert them back.

    The parameters ``magics``, ``cats`` and ``langlinks`` can be lists of
    objects (either string, wikicode or node) to be added to the header if not
    already present. These deduplication rules are applied:

    - supplied magic words take precedence over those present in wikicode
    - category links are considered duplicate when they point to the same
      category (e.g. [[Category:Foo]] is equivalent to [[category:foo]])
    - interlanguage links are considered duplicate when they have the same
      language tag (i.e. there can be only one interlanguage link for each
      language)

    Links whose prefix is a language tag that does not work as an
    interlanguage link are not header elements; they are left in place.

    The lists of magics and langlinks are sorted, the order of catlinks is
    preserved.

    :raises HeaderError: when the header elements do not share one parent, or
        when elements were found but no parent could be determined
    """
    parse_anything = mwparserfromhell.utils.parse_anything

    if magics is None:
        magics = []
    if cats is None:
        cats = []
    if langlinks is None:
        langlinks = []

    # make sure that we work with `Wikicode` objects
    magics = [parse_anything(item) for item in magics]
    cats = [parse_anything(item) for item in cats]
    langlinks = [parse_anything(item) for item in langlinks]

    parent = None

    def _prefix(title):
        if ":" not in title:
            return ""
        return title.split(":", 1)[0].strip()

    # check the parent wikicode object and remove node from it
    def _remove(node):
        nonlocal parent
        if parent is None:
            parent = get_parent_wikicode(wikicode, node)
        else:
            p = get_parent_wikicode(wikicode, node)
            if parent is not p:
                raise HeaderError
        if remove_from_parent is True:
            remove_and_squash(parent, node)

    def _add_to_magics(template):
        _remove(template)
        if not any(magic.get(0).name.matches(template.name) for magic in magics):
            magics.append(parse_anything(template))

    def _add_to_cats(catlink):
        # TODO: non-duplicate "typos" are still ignored -- is this important enough to handle it?
        if not any(cat.get(0).title.matches(catlink.title) for cat in cats):
            # only remove from wikicode if we actually append to cats (duplicate category
            # links are considered typos, e.g. [[Category:foo]] instead of [[:Category:foo]],
            # which are quite common)
            _remove(catlink)
            cats.append(parse_anything(catlink))

    def _add_to_langlinks(langlink):
        # always remove langlinks to handle renaming of pages
        # (typos such as [[en:Main page]] in text are quite rare)
        _remove(langlink)
        tag = _prefix(langlink.title).lower()
        if not any(_prefix(link.get(0).title).lower() == tag for link in langlinks):
            langlinks.append(parse_anything(langlink))

    # count extracted header elements
    _extracted_count = 0

    for template in wikicode.filter_templates():
        _pure, _ = lang.detect_language(str(template.name))
        if canonicalize(template.name) == "Lowercase title" or _prefix(template.name) == "DISPLAYTITLE" or _pure in ["Template", "Template:Template"]:
            _add_to_magics(template)
            _extracted_count += 1

    for link in wikicode.filter_wikilinks():
        prefix = _prefix(link.title).lower()
        if prefix == "category":
            _add_to_cats(link)
            _extracted_count += 1
        # GOTCHA: not all tags work as interlanguage links; the check has to
        # be done before the link is touched, otherwise it would be removed
        # from the page without being added to ``langlinks``
        elif prefix in lang.get_language_tags() and lang.is_interlanguage_tag(prefix):
            _add_to_langlinks(link)
            _extracted_count += 1

    magics.sort()
    langlinks.sort()

    if parent is None:
        if _extracted_count > 0:
            # this indicates parser error (e.g. unclosed <div> tags)
            raise HeaderError("no parent Wikicode object")
        else:
            # for pages without any header elements
            parent = wikicode

    return parent, magics, cats, langlinks
def _page_exists(self, title):
    """
    Check whether a page with the given title is listed in ``self.allpages``.

    :param title: page title; it is canonicalized before the comparison
    :returns: ``True`` if some entry of ``self.allpages`` has the canonical
        title under its ``"title"`` key, ``False`` otherwise
    """
    # self.allpages does not include redirects, but that's fine...
    wanted = canonicalize(title)
    # compare directly instead of building a throwaway set of all titles on
    # every call; any() also stops at the first match
    return any(page["title"] == wanted for page in self.allpages)
def get_header_parts(wikicode, magics=None, cats=None, langlinks=None, remove_from_parent=False):
    """
    Extract the automatically maintainable header elements of a page.

    According to Help:Style, the layout of the page should be as follows:

    1. Magic words (optional)
       (includes only {{DISPLAYTITLE:...}} and {{Lowercase title}})
    2. Categories
    3. Interlanguage links (if any)
    4. Article status templates (optional)
    5. Related articles box (optional)
    6. Preface or introduction
    7. Table of contents (automatic)
    8. Article-specific sections

    Only 1-3 are safe to be updated automatically, so only those are
    collected. The result is the tuple ``(parent, magics, cats, langlinks)``,
    where ``parent`` is the :py:class:`mwparserfromhell.wikicode.Wikicode`
    object containing all extracted elements. All header elements are
    expected to be children of the same parent node, otherwise
    :py:exc:`HeaderError` is raised. Elements nested in ``<includeonly>``
    tags are ignored.

    With ``remove_from_parent=True`` the extracted elements are also removed
    from the parent node; :py:func:`build_header` should be called to insert
    them back.

    ``magics``, ``cats`` and ``langlinks`` can be lists of objects (either
    string, wikicode or node) to be added to the header if not already
    present. These deduplication rules are applied:

    - supplied magic words take precedence over those present in wikicode
    - category links are considered duplicate when they point to the same
      category (e.g. [[Category:Foo]] is equivalent to [[category:foo]])
    - interlanguage links are considered duplicate when they have the same
      language tag (i.e. there can be only one interlanguage link for each
      language)

    The lists of magics and langlinks are sorted, the order of catlinks is
    preserved.
    """
    to_wikicode = mwparserfromhell.utils.parse_anything

    # make sure that we work with `Wikicode` objects
    magics = [] if magics is None else [to_wikicode(item) for item in magics]
    cats = [] if cats is None else [to_wikicode(item) for item in cats]
    langlinks = [] if langlinks is None else [to_wikicode(item) for item in langlinks]

    parent = None

    def _prefix(title):
        if ":" in title:
            return title.split(":", 1)[0].strip()
        return ""

    # check the parent wikicode object and remove node from it
    def _remove(node):
        nonlocal parent
        node_parent = get_parent_wikicode(wikicode, node)
        if parent is None:
            parent = node_parent
        elif parent is not node_parent:
            raise HeaderError
        if remove_from_parent is True:
            remove_and_squash(parent, node)

    def _add_to_magics(template):
        _remove(template)
        for magic in magics:
            if magic.get(0).name.matches(template.name):
                return
        magics.append(to_wikicode(template))

    def _add_to_cats(catlink):
        # TODO: non-duplicate "typos" are still ignored -- is this important enough to handle it?
        if any(cat.get(0).title.matches(catlink.title) for cat in cats):
            return
        # only remove from wikicode if we actually append to cats (duplicate category
        # links are considered typos, e.g. [[Category:foo]] instead of [[:Category:foo]],
        # which are quite common)
        _remove(catlink)
        cats.append(to_wikicode(catlink))

    def _add_to_langlinks(langlink):
        # always remove langlinks to handle renaming of pages
        # (typos such as [[en:Main page]] in text are quite rare)
        _remove(langlink)
        for link in langlinks:
            if _prefix(link.get(0).title).lower() == _prefix(langlink.title).lower():
                return
        langlinks.append(to_wikicode(langlink))

    def _is_in_includeonly(node):
        return any(
            isinstance(ancestor, mwparserfromhell.nodes.tag.Tag)
            and ancestor.tag.matches("includeonly")
            for ancestor in wikicode.get_ancestors(node)
        )

    # count extracted header elements
    extracted = 0

    for template in wikicode.filter_templates():
        if _is_in_includeonly(template):
            continue
        _pure, _ = lang.detect_language(str(template.name))
        is_magic = (
            canonicalize(template.name) == "Lowercase title"
            or _prefix(template.name) == "DISPLAYTITLE"
            or _pure in ["Template", "Template:Template"]
        )
        if not is_magic:
            continue
        _add_to_magics(template)
        extracted += 1

    for link in wikicode.filter_wikilinks():
        if _is_in_includeonly(link):
            continue
        prefix = _prefix(link.title).lower()
        if prefix == "category":
            _add_to_cats(link)
        # GOTCHA: not all tags work as interlanguage links
        elif lang.is_interlanguage_tag(prefix):
            _add_to_langlinks(link)
        else:
            continue
        extracted += 1

    magics.sort()
    langlinks.sort()

    if parent is not None:
        return parent, magics, cats, langlinks

    if extracted > 0:
        # this indicates parser error (e.g. unclosed <div> tags)
        raise HeaderError("no parent Wikicode object")

    # for pages without any header elements
    return wikicode, magics, cats, langlinks