def _strip_tags(tree: mwparserfromhell.wikicode.Wikicode, threshold: int = 200):
    """Unwrap plain tags and drop short ``<ref>`` tags from *tree*.

    Works in two passes: first, every tag that is not a ``<ref>`` and
    carries no attributes is replaced by its bare contents; second,
    ``<ref>`` tags whose contents are shorter than *threshold*
    characters are removed entirely.
    """
    # Pass 1: unwrap attribute-less formatting tags, keeping inner text.
    for node in tree.filter_tags():
        if node.tag != 'ref' and not node.attributes:
            tree.replace(node, node.contents)
    # Pass 2: re-scan (the tree changed) and drop short references.
    for node in tree.filter_tags():
        if node.tag == 'ref' and len(node.contents) < threshold:
            _tree_remove(tree, node)
def remove_no_include(
        wikicode: mwparserfromhell.wikicode.Wikicode
) -> mwparserfromhell.wikicode.Wikicode:
    """Removes the noinclude tags"""
    # Collect first, then remove, so the removal does not interfere
    # with the iteration.
    noinclude_nodes = [
        node for node in wikicode.filter_tags()
        if node.tag.matches('noinclude')
    ]
    for node in noinclude_nodes:
        try:
            wikicode.remove(node)
        except ValueError:
            # Node already gone (e.g. nested in a removed parent).
            pass
    return wikicode
def naiveStrip(self, wikiCode: mwparserfromhell.wikicode.Wikicode):
    """Strip the wiki markup delimiters and return the plain text.

    Accepts pages and sections in Wikicode format.

    Arguments:
        wikiCode {mwparserfromhell.wikicode.Wikicode} -- wikicode for a
            page or a section.
    """
    plain_text = wikiCode.strip_code()
    return plain_text
def _strip_ext_links(tree: mwparserfromhell.wikicode.Wikicode,
                     threshold: int = 250, url_threshold: int = 1000):
    """Remove short external links from *tree*.

    A link is dropped when its URL is shorter than *url_threshold*
    characters and its display title is either absent or shorter than
    *threshold* characters.
    """
    for ext_link in tree.filter_external_links():
        url_is_short = len(ext_link.url) < url_threshold
        title_is_short = (ext_link.title is None
                          or len(ext_link.title) < threshold)
        if url_is_short and title_is_short:
            _tree_remove(tree, ext_link)
def replace_gallery_files(
        self, wikicode: mwparserfromhell.wikicode.Wikicode) -> None:
    """
    Replace files in <gallery>.

    Each line inside a <gallery> tag has the form ``title|caption``.
    For every line whose title resolves to a BSicon page that appears
    in the ``bsicons_map`` option, the title is swapped for the
    replacement icon's title and the substitution is recorded on
    ``self.current_page.replacements``.

    :param wikicode: Parsed wikitext
    """
    for tag in wikicode.ifilter_tags():
        if tag.tag.lower() != "gallery":
            continue
        lines = str(tag.contents).splitlines()
        for i, line in enumerate(lines):
            # Split "title|caption"; sep is "" when the line has no pipe.
            title, sep, caption = removeDisabledParts(line).partition("|")
            if not title:
                continue
            try:
                current_icon = BSiconPage(self.current_page.site, title)
                # title() raises when the page title is invalid.
                current_icon.title()
            except (pywikibot.exceptions.Error, ValueError):
                continue
            new_icon = self.opt.bsicons_map.get(current_icon, None)
            if new_icon:
                lines[i] = f"{new_icon.title()}{sep}{caption}"
                self.current_page.replacements.add(
                    Replacement(current_icon, new_icon))
        # NOTE(review): this condition is true once ANY replacement was
        # recorded for the page, not just for this gallery tag, so a
        # later untouched gallery would also be rewritten (a no-op
        # rewrite) — TODO confirm intended.
        if self.current_page.replacements:
            tag.contents = "\n".join(lines) + "\n"
def _strip_templates(tree: mwparserfromhell.wikicode.Wikicode,
                     threshold: int = 30):
    """Drop small template parameters, then small emptied templates.

    A parameter is removed when both its name and its value are shorter
    than *threshold* characters; a template left with no parameters is
    then removed entirely if its own name is also shorter than
    *threshold*.
    """
    for tmpl in tree.filter_templates(recursive=tree.RECURSE_OTHERS):
        # Walk the params in reverse so removal does not disturb the
        # entries still to be visited.
        for par in reversed(tmpl.params):
            if len(par.name) < threshold and len(par.value) < threshold:
                tmpl.remove(par)
        if not tmpl.params and len(tmpl.name) < threshold:
            _tree_remove(tree, tmpl)
def _parse_case_page(
        index: str, name: str,
        tree: mwparserfromhell.wikicode.Wikicode) -> Optional[CasePage]:
    """Parse one case page into a CasePage of per-range Sections.

    Returns None for courtesy-blanked pages.  Unexpected page structure
    is treated as a hard error (assert) rather than tolerated.
    """
    # Find the level-2 section whose heading matches *name*.
    diff_sections = tree.get_sections(
        matches=lambda title: title.matches(name))
    if not diff_sections and '<!-- Template:Courtesy blanked -->' in tree:
        return None
    assert len(diff_sections) == 1, tree.filter_headings()
    diff_section = diff_sections[0]
    assert diff_section.nodes[0].level == 2
    # Classify every other section: known-ignorable headings go to
    # `excluded`; anything else is `unknown` and will fail the assert.
    excluded = [diff_section]
    unknown = []
    for section in tree.get_sections(include_lead=False):
        if any(section in excl for excl in excluded):
            continue
        if section.nodes[0].title.matches(_IGNORED_HEADINGS):
            excluded.append(section)
        else:
            unknown.append(section)
    # Re-filter: `excluded` may have grown after an `unknown` entry was
    # added, so subsections of later-excluded sections must be dropped.
    unknown = [
        section for section in unknown
        if not any(section in excl for excl in excluded)
    ]
    assert not unknown, [section.nodes[0] for section in unknown]
    sections = {}
    for section in diff_section.get_sections(levels=[3]):
        title = section.nodes[0].title.strip()
        match = re.match(r'^Pages (\d+) through (\d+)$', title)
        assert match, title
        # NOTE(review): this rebinds the *index* parameter, so the final
        # CasePage below receives the LAST section's range index rather
        # than the caller-supplied one — TODO confirm intended.
        index = f'{match.group(1)}-{match.group(2)}'
        # Non-empty body lines of the section (heading line dropped).
        lines = [
            line.strip() for line in section.splitlines()[1:]
            if line.strip()
        ]
        if _is_collapsed(lines):
            continue
        pages = [_parse_line(line) for line in lines]
        pages = [line for line in pages if line]
        if not pages:
            continue
        sections[index] = Section(title, pages)
    return CasePage(index, sections)
def parse_family(wikicode: mwp.wikicode.Wikicode):
    """Extract a family name from an infobox value.

    Returns the single linked family name, a comma-style merge of the
    stripped text when several links are present, or None when no
    family link is found.
    """
    wikicode = parse_markup(wikicode)
    # Ignore category links; only real article links name a family.
    family_links = [
        wl for wl in wikicode.filter_wikilinks()
        if "category" not in wl.lower()
    ]
    if not family_links:
        print("unable to find family", wikicode)
        return None
    if len(family_links) == 1:
        return family_links[0].title.strip_code().strip()
    # More than one link: warn, flatten every link to its title, and
    # fall back to the stripped text of the whole value.
    print("unexpected number of wikilinks", wikicode)
    for wl in family_links:
        wikicode.replace(wl, wl.title.strip_code().strip())
    return wikicode.strip_code().strip()
def keep_only_includes(
        wikicode: mwparserfromhell.wikicode.Wikicode
) -> mwparserfromhell.wikicode.Wikicode:
    """Keeps only the onlyincludes tags if any"""
    # Inspect only the outermost tags.
    other_tags = []
    has_only_include = False
    for node in wikicode.filter_tags(recursive=False):
        if node.tag.matches('onlyinclude'):
            has_only_include = True
        else:
            other_tags.append(node)
    if not has_only_include:
        return wikicode
    # <onlyinclude> present: drop every other top-level tag ...
    for node in other_tags:
        try:
            wikicode.remove(node)
        except ValueError:
            # Already gone (nested inside a previously removed tag).
            pass
    # ... then strip the <onlyinclude> markers themselves and reparse.
    return mwparserfromhell.parse(
        re.sub(onlyinclude_tag, '', str(wikicode)))
def parse_names(wikicode: mwp.wikicode.Wikicode):
    """Split a names field into a list of cleaned name strings.

    Single-parameter templates collapse to that parameter; known
    multi-parameter title templates ("highprince", "army") are
    rewritten to readable phrases.  The stripped text is then split on
    commas and empty entries are discarded.
    """
    wikicode = parse_markup(wikicode)
    for tmpl in wikicode.filter(forcetype=Template):
        if len(tmpl.params) == 1:
            wikicode.replace(tmpl, tmpl.params[0])
        elif len(tmpl.params) > 1:
            first = tmpl.params[0]
            if "highprince" in first.lower():
                wikicode.replace(tmpl, "{0} of {1}".format(*tmpl.params))
            elif "army" in first.lower():
                wikicode.replace(tmpl, "{1} {0}".format(*tmpl.params))
    pieces = (piece.strip() for piece in wikicode.strip_code().split(","))
    return [piece for piece in pieces if piece]
def parse_markup(wikicode: mwp.wikicode.Wikicode):
    """Drop reference templates and reduce wikilinks to bare links.

    Templates whose name contains "ref" or "wob" are removed; every
    wikilink is replaced by a link carrying only its display text (or
    its title when no display text is set).  Mutates and returns
    *wikicode*.
    """
    # Snapshot both node lists up front, before any mutation.
    templates = wikicode.filter_templates()
    links = wikicode.filter_wikilinks()
    # remove references
    for tmpl in templates:
        if "ref" in tmpl.name or "wob" in tmpl.name:
            wikicode.remove(tmpl)
    # simplify links
    for link in links:
        label = link.text if link.text else link.title
        wikicode.replace(link, Wikilink(label))
    return wikicode
def parse_abilities(wikicode: mwp.wikicode.Wikicode):
    """Collect ability names from "tag" templates and wikilinks.

    Returns a list of lower-cased ability strings; "shard"/"vessel"/
    "splinter" and "squire" templates expand to descriptive phrases.
    """
    wikicode = parse_markup(wikicode)
    abilities = []
    for node in wikicode.filter(forcetype=(Template, Wikilink)):
        if isinstance(node, Wikilink):
            abilities.append(node.title.strip_code().lower())
            continue
        # Template: only {{tag ...}}-style templates carry abilities.
        if "tag" not in node.name:
            continue
        # Positional params contribute their value, keyword params their
        # name; the "cat" marker param is skipped before lower-casing.
        raw_params = (
            p.name.strip_code() if p.showkey is True else p.value.strip_code()
            for p in node.params
        )
        params = tuple(p.lower() for p in raw_params if p != "cat")
        if len(params) == 1:
            abilities.append(params[0])
        elif len(params) > 1:
            head = params[0]
            if head in ("shard", "vessel", "splinter"):
                abilities.extend((head, f"{head} of {params[1]}"))
            elif head == "squire":
                abilities.append(f"squire ({params[1].split()[-1]})")
            else:
                print(
                    "unknown ability while parsing character: ",
                    params
                )
    return abilities
def replace_template_files(
        self, wikicode: mwparserfromhell.wikicode.Wikicode) -> None:
    """
    Replace files in templates.

    Resolves each transclusion to its Template-namespace page and
    dispatches to the matching replacement routine (routemap, railway
    track, or generic BS template).

    :param wikicode: Parsed wikitext
    """
    assert self.site_config is not None
    config = self.site_config
    for transclusion in wikicode.ifilter_templates():
        try:
            template_page = Page(
                self.current_page.site,
                removeDisabledParts(str(transclusion.name)),
                ns=self.current_page.site.namespaces.TEMPLATE,
            )
            # title() raises when the template name is invalid.
            template_page.title()
        except (pywikibot.exceptions.Error, ValueError):
            continue
        if template_page in config.routemap_templates:
            self._replace_routemap_files(transclusion)
        elif template_page in config.railway_track_templates:
            self._replace_rt_template_files(transclusion)
        else:
            self._replace_bs_template_files(transclusion, template_page)
def parse_species(wikicode: mwp.wikicode.Wikicode):
    """Map an infobox species field to a canonical species name.

    Looks up the stripped, lower-cased text in the module-level
    ``species`` mapping.  Spren-like values (containing "spren" or
    "cryptic") are collapsed to "Spren", with the specific kind
    recorded in ``char_info["subspecies"]`` as a side effect.

    Returns the canonical species name, or None when unknown.
    """
    spec = species.get(wikicode.strip_code().lower().strip(), None)
    # Bug fix: the substring test previously referenced an undefined
    # name ``val`` (NameError whenever the lookup succeeded); it must
    # inspect the looked-up species value itself.
    if spec is not None and any(
            s in spec.lower() for s in ("spren", "cryptic")):
        char_info["subspecies"] = spec
        spec = "Spren"
    return spec
def parse_ethnicity(wikicode: mwp.wikicode.Wikicode):
    """Flatten an infobox ethnicity field to a title-cased string.

    Wikilinks collapse to their titles; templates collapse to their
    non-"cat" parameters; "X and Y" becomes "X, Y"; "Skaa, Noble" is
    special-cased to "Half-Skaa".
    """
    wikicode = parse_markup(wikicode)
    # Snapshot the template/link nodes before mutating the text below.
    wikicode_elements = wikicode.filter(forcetype=(Template, Wikilink))
    if " and " in wikicode:
        wikicode.replace(" and ", ", ")
    if not wikicode_elements:
        ethnicity = wikicode.strip_code().strip().title()
    else:
        for wc in wikicode_elements:
            if isinstance(wc, Wikilink):
                wikicode.replace(wc, wc.title.strip_code().strip())
            elif isinstance(wc, Template):
                # Positional params contribute their value, keyword
                # params their name; the "cat" marker is skipped.
                params = tuple(
                    p for p in (
                        p.name.strip_code() if p.showkey is True
                        else p.value.strip_code()
                        for p in wc.params
                    )
                    if p != "cat"
                )
                if len(params) == 1:
                    wikicode.replace(wc, params[0])
                elif "Noble" in params or "noble" in params:
                    wikicode.replace(wc, "Noble")
                else:
                    # NOTE(review): this passes a tuple, which the
                    # parser stringifies as "('a', 'b')" — possibly a
                    # ", ".join was intended; TODO confirm.
                    wikicode.replace(wc, params)
        ethnicity = wikicode.strip_code().strip().title()
    if ethnicity == "Skaa, Noble":
        ethnicity = "Half-Skaa"
    return ethnicity
def parse_residence(wikicode: mwp.wikicode.Wikicode):
    """Flatten an infobox residence field to a plain string.

    Wikilinks collapse to their title-cased titles; templates collapse
    to their non-"cat" parameters; "<br>" separators become ", ";
    parenthesised qualifiers are stripped.
    """
    wikicode = parse_markup(wikicode)
    # Snapshot the template/link nodes before mutating the text below.
    wikicode_elements = wikicode.filter(forcetype=(Template, Wikilink))
    if "<br>" in wikicode:
        wikicode.replace("<br>", ", ")
    if not wikicode_elements:
        residence = wikicode.strip_code().strip()
    else:
        for wc in wikicode_elements:
            if isinstance(wc, Wikilink):
                wikicode.replace(wc, wc.title.strip_code().strip().title())
            elif isinstance(wc, Template):
                # Positional params contribute their value, keyword
                # params their name; the "cat" marker is skipped.
                params = tuple(
                    p for p in (
                        p.name.strip_code() if p.showkey is True
                        else p.value.strip_code()
                        for p in wc.params
                    )
                    if p != "cat"
                )
                if len(params) == 1:
                    wikicode.replace(wc, params[0])
                else:
                    # NOTE(review): this passes a tuple, which the
                    # parser stringifies as "('a', 'b')" — possibly a
                    # ", ".join was intended; TODO confirm.
                    wikicode.replace(wc, params)
        residence = wikicode.strip_code().strip()
    # Drop parenthesised qualifiers, e.g. "Foo (city)" -> "Foo".
    residence = re.sub(r"\s?\([\w\s]+\)", "", residence)
    # special cases
    if residence.startswith("15 Stranat Place"):
        residence = "Elendel"
    return residence
def parse_nation(wikicode: mwp.wikicode.Wikicode):
    """Translate a demonym field into a nation name.

    Returns None when the normalised text is not in the module-level
    ``demonyms`` mapping.
    """
    lookup_key = wikicode.strip_code().lower().strip()
    return demonyms.get(lookup_key, None)
def parse_profession(wikicode: mwp.wikicode.Wikicode):
    """Return the profession field as normalised lower-case text.

    ``strip_code`` already yields a plain ``str``, so the former
    ``str(...)`` wrapper was redundant and has been dropped.
    """
    # todo: parse with simple NLP
    return wikicode.strip_code().lower().strip()
def _tree_remove(tree: mwparserfromhell.wikicode.Wikicode, node: mwparserfromhell.nodes.Node): try: tree.remove(node) except ValueError: pass
def _strip_wikilinks(tree: mwparserfromhell.wikicode.Wikicode,
                     threshold: int = 50):
    """Remove short wikilinks from *tree*.

    A link is dropped when its title is shorter than *threshold*
    characters and its display text is either absent or also shorter
    than *threshold*.
    """
    for wl in tree.filter_wikilinks():
        title_is_short = len(wl.title) < threshold
        text_is_short = wl.text is None or len(wl.text) < threshold
        if title_is_short and text_is_short:
            _tree_remove(tree, wl)