def combine_template_chains(wc: Wikicode, new_template_name: str,
                            template_indices: List[int],
                            text_indices: List[int]) -> None:
    """
    Helper function for combining templates that are linked via free text
    into a structured template hierarchy.

    Runs of template nodes connected by linking text are collected into
    chains; each chain is replaced by a single ``new_template_name``
    template whose positional parameters are the chained templates.
    """
    chains: List[List[int]] = []
    current: List[int] = []
    linked_forward = False
    for pos in template_indices:
        # A template joins the current chain when it is followed by linking
        # text, or when the previous chained template linked forward to it.
        if (pos + 1 in text_indices) or (pos - 2 in current and linked_forward):
            current.append(pos)
        linked_forward = pos + 1 in text_indices
        if not linked_forward:
            # Chain broken: keep it only if it actually combines >1 template.
            if len(current) > 1:
                chains.append(current)
            current = []
    if len(current) > 1:
        chains.append(current)

    chained_nodes = [[wc.nodes[pos] for pos in chain] for chain in chains]
    for nodes in chained_nodes:
        params = [
            Parameter(str(ordinal + 1), node, showkey=False)
            for ordinal, node in enumerate(nodes)
        ]
        combined = Template(new_template_name, params=params)
        wc.insert_before(nodes[0], combined, recursive=False)
        for node in nodes:
            wc.remove(node, recursive=False)
def __init__(self, wikicode: Wikicode, subordinates: List[Wikicode],
             header_type: str = None, idx: int = None, metainfo=None,
             lvl: int = 3):
    """
    Build a section wrapper around a heading's wikicode.

    :param wikicode: heading source, e.g. ``"===Pronunciation==="``.
    :param subordinates: wikicode chunks subordinate to this heading.
    :param header_type: heading text; deduced from ``wikicode`` if None.
    :param idx: etymology index; deduced from ``"Etymology N"`` if None.
    :param metainfo: section category; defaults to ``header_type`` (or
        "Definition" for part-of-speech headings).
    :param lvl: heading level (number of ``=`` signs).
    """
    assert wikicode.startswith("=" * lvl) and not wikicode.startswith(
        "=" * (lvl + 1))  # assert that the level is correct
    if header_type is None:  # auto header type deduce
        breve: str = wikicode[
            lvl:]  # len("===") == 3  # "===Pronunciation===" --> "Pronunciation==="
        header_type = breve[:breve.index(
            "=" * lvl)]  # "Pronunciation===" -> "Pronunciation"
    if idx is None and header_type.startswith(
            "Etymology "):  # auto idx deduce
        assert len(header_type) >= 11  # len("Etymology 1")
        idx = int(header_type[10:])
        # isinstance instead of type(...) == int; idx must be a positive int.
        assert idx and isinstance(idx, int)
        header_type = "Etymology"
    defn_flag = header_type in poscodes.all
    if defn_flag and metainfo is None:
        metainfo = "Definition"
    self.wikicode = wikicode
    self.subordinates = subordinates
    self.idx = idx
    self.header_type = header_type
    self.metainfo = header_type if metainfo is None else metainfo  # TODO: make it into an enum
    self.lvl = lvl
def from_citeweb(wikicode: Wikicode) -> Wikicode:
    """
    Replace {{Cite web}} templates (without format/author parameters) by a
    plain-text Japanese-style citation, and report the replacement count.

    :param wikicode: page wikicode, mutated in place.
    :return: the same wikicode object.
    """
    count = 0
    for t in wikicode.filter_templates():
        # BUGFIX: the original called t.has(format) with the *builtin*
        # ``format`` function instead of the string 'format'.
        if t.name == 'Cite web' and not t.has('format') and not t.has('author'):
            title = getvalue(t, 'title')
            url = getvalue(t, 'url')
            date = format_date(getvalue(t, 'date'))
            website = getvalue(t, 'website')
            publisher = getvalue(t, 'publisher')
            accessdate = format_date(getvalue(t, 'accessdate')) if getvalue(
                t, 'accessdate') else format_date(getvalue(t, 'access-date'))
            # BUGFIX: all() takes a single iterable; all(a, b, c) is a TypeError.
            if all((title, url, accessdate)):
                new = f'“[{url} {title}]”'
                if publisher and website:
                    new += f'. \'\'{website}\'\'. {publisher} ({date}). ' if date else f'. \'\'{website}\'\'. {publisher}. '
                elif publisher and not website:
                    new += f'. {publisher} ({date}). ' if date else f'. {publisher}. '
                elif not publisher and website:
                    new += f'. \'\'{website}\'\' ({date}). ' if date else f'. \'\'{website}\'\'. '
                else:
                    new += f' ({date}) .' if date else f'. '
                new += f'{accessdate}閲覧。'
                wikicode.replace(t, new)
                count += 1
    pywikibot.output(f'{{{{Cite web}}}} を {count} 回置換しました')
    return wikicode
def get_tail_text(root: Wikicode, lb_idx: int) -> Optional[Wikicode]:
    """
    If an {{l|fi ...}} is followed by some text after a colon, we might want
    to put it in its description.

    Returns the nodes following ``lb_idx`` up to the next {{l|fi}} template
    (with the leading ":" stripped), or None if there is no such tail.
    """
    cursor = lb_idx + 1
    try:
        head = root.get(cursor)
    except IndexError:
        return None
    # The tail must begin with a colon-prefixed text node.
    if not isinstance(head, Text) or not head.value.startswith(":"):
        return None
    head.value = head.value[1:].strip()
    collected = [head]
    cursor += 1
    while True:
        try:
            node = root.get(cursor)
        except IndexError:
            break
        if is_l_fi(node):
            break
        collected.append(node)
        cursor += 1
    return Wikicode(collected)
def repl_conditional(self, arg: Template, code: Wikicode, index: Union[str, int]):
    """
    Replace *arg* inside *code* with its parameter at *index*; if the
    parameter is absent, remove *arg* entirely.

    The parameter value is first expanded via ``apply_wikitext``; keyword
    parameters are substituted as stripped strings, positional ones as-is.
    """
    if not arg.has(index):
        code.remove(arg)
        return
    param = arg.get(index)
    self.apply_wikitext(param.value)
    replacement = str(param.value).strip() if param.showkey else param.value
    code.replace(arg, replacement)
def merge_etyl_templates(wc: Wikicode) -> Wikicode:
    """
    Given a chunk of wikicode, finds instances where the deprecated `etyl`
    template is immediately followed by either a word in free text, a linked
    word, or a generic `mention`/`link`/`langname-mention` template. It
    replaces this pattern with a new `derived-parsed` template -- meaning the
    same thing as the `derived` template but namespaced to differentiate.
    For cases where the `mention` language is different from the `etyl`
    language, we use the former. The template is removed if we can't parse
    it effectively.
    """
    etyl_indices = [
        i for i, node in enumerate(wc.nodes)
        if isinstance(node, Template) and node.name == "etyl"
        and i < len(wc.nodes) - 1  # must have a following node to inspect
    ]

    nodes_to_remove = []
    for i in etyl_indices:
        make_new_template = False
        etyl: Template = wc.nodes[i]
        related_language = etyl.params[0]
        if len(etyl.params) == 1:
            language = "en"
        else:
            language = etyl.params[1]
        node = wc.nodes[i + 1]
        if isinstance(node, Text):
            # BUGFIX: the original pattern ",| |" contained an empty
            # alternative, which makes re.split() split at every position and
            # always yields "" as the first element. ",| " (comma or space)
            # is the intended word separator.
            val = re.split(r",| ", node.value.strip())[0]
            if val:
                make_new_template = True
        elif isinstance(node, Wikilink):
            val = node.text or node.title
            val = re.split(r",| ", val.strip())[0]
            if val:
                make_new_template = True
        elif isinstance(node, Template):
            if node.name in ("m", "mention", "m+", "langname-mention", "l", "link"):
                # Prefer the mention template's language over etyl's.
                related_language = node.params[0]
                if len(node.params) > 1:
                    val = node.params[1].value
                    make_new_template = True
                nodes_to_remove.append(node)
        if make_new_template:
            params = [
                Parameter(str(pos + 1), str(param), showkey=False)
                for pos, param in enumerate([language, related_language, val])
            ]
            new_template = Template("derived-parsed", params=params)
            wc.replace(etyl, new_template, recursive=False)
        else:
            # Unparsable etyl: drop it.
            nodes_to_remove.append(etyl)
    for node in nodes_to_remove:
        wc.remove(node, recursive=False)
    return wc
def flag_template(self: TemplateParser, code: Wikicode, template: Template,
                  flag, index=None):
    """
    Substitute *template* in *code* by its parameter at *index* (expanding
    its wikitext first), or remove it when no usable index is given; then
    record *flag* in the parser state.
    """
    has_param = bool(index) and template.has(index)
    if has_param:
        param = template.get(index)
        self.apply_wikitext(param.value)
        code.replace(template, param)
    else:
        code.remove(template)
    self.state.flags.add(flag)
def clean_wikicode(wc: Wikicode):
    """
    Performs operations on each etymology section that get rid of extraneous
    nodes and create new templates based on natural-language parsing.
    """
    def _is_noise(node):
        # Anything that is not text/link/template, or is whitespace-only text.
        if not isinstance(node, (Text, Wikilink, Template)):
            return True
        return isinstance(node, Text) and not node.value.strip()

    for node in wc.filter(recursive=False, matches=_is_noise):
        wc.remove(node)
    merge_etyl_templates(wc)
    get_plus_combos(wc)
    get_comma_combos(wc)
    get_from_chains(wc)
    remove_links(wc)
def is_l_fi(template: Wikicode):
    """Return True iff *template* is an {{l|fi|...}} link template."""
    return (isinstance(template, Template)
            and template.name == "l"
            and str(template.get(1).value) == "fi")
def title_gen(wikicode: Wikicode) -> Iterator[Tuple[int, Tag]]:
    """Yield (index, tag) for each distinct '#' list-item tag in *wikicode*.

    Consecutive hits at the same node index are emitted only once.
    """
    last_index = None
    for index, tag in wikicode._indexed_ifilter(matches="#", forcetype=Tag):
        if tag == "#" and index != last_index:
            yield index, tag
            last_index = index
def handle_template(tpl, code, namespace=None):
    """
    Normalize LVA (course) template data in place and archive stale pages.

    Canonicalizes 'sprache' and 'wann' values, drops placeholder 'tiss' ids,
    converts 'veraltet'/'nachfolger' parameters and {{Veraltet}} templates
    into a leading {{Archiv}} template, and sorts 'zuordnungen' entries.

    :return: edit summary string.
    """
    if tpl.has('sprache'):
        if tpl.get('sprache').value.strip().lower() in ('englisch', 'english'):
            set_param_value(tpl, 'sprache', 'en')
        if tpl.get('sprache').value.strip().lower() in ('deutsch', 'german'):
            set_param_value(tpl, 'sprache', 'de')
    if tpl.has('wann'):
        if tpl.get('wann').value.strip() in ('Sommersemester', 'ss'):
            set_param_value(tpl, 'wann', 'SS')
        elif tpl.get('wann').value.strip() in ('Wintersemester', 'ws'):
            set_param_value(tpl, 'wann', 'WS')
        elif tpl.get('wann').value.strip() in ('Winter- und Sommersemester',
                                               'Sommer- und Wintersemester'):
            set_param_value(tpl, 'wann', 'beide')
    if tpl.has('tiss'):
        # '1234567890' is a placeholder id, not a real TISS reference.
        if tpl.get('tiss').value.strip() == '1234567890':
            tpl.remove('tiss')
    archived = False
    successor = None
    if tpl.has('veraltet'):
        archived = True
        tpl.remove('veraltet')
    if tpl.has('nachfolger'):
        archived = True
        successor = tpl.get('nachfolger').value.strip()
        tpl.remove('nachfolger')
    for t in code.ifilter_templates(
            matches=lambda t: t.name.matches('Veraltet')):
        archived = True
        code.remove(t)
    archivedFlag = code.filter_templates(
        matches=lambda t: t.name.matches('Archiv'))
    if archived and not archivedFlag:
        # BUGFIX: the original rebound ``tpl`` to the new Archiv template,
        # so the 'zuordnungen' handling below silently operated on the wrong
        # template. Use a dedicated name instead.
        archiv_tpl = Template(Wikicode([Text('Archiv')]))
        if successor:
            archiv_tpl.add('nachfolger', successor)
        code.insert(0, archiv_tpl)
        code.insert(1, '\n\n')
    if tpl.has('zuordnungen'):
        rels = tpl.get('zuordnungen').value.filter_templates()
        for rel in rels:
            if rel.has('2'):
                rel.get('2').value = str(rel.get('2').value).replace('–', '-')
        rels.sort(key=lambda x: x.get('1'))
        tpl.get('zuordnungen').value = '\n' + '\n'.join(
            [' ' * 4 + str(r) for r in rels]) + '\n'
    return 'fixe LVA-Daten'
def handle_der_container(ctx: ParseContext, template: Wikicode) -> Iterator[Tuple[str, Any]]:
    """
    e.g. {{der2|fi|title=phrasal verbs
    |{{l|fi|[[pitää ääntä|''pitää'' ääntä]] + ''elative''|t=to [[make]] (a)
    [[noise]] about something {{q|e.g. about an egregious problem}}}}
    or e.g.
    {{der3|fi|title=nouns
    |pidike
    |pidin
    """
    cats = []
    if template.has("title"):
        cats.append(str(template.get("title").value).strip())
    param_no = 1
    while template.has(param_no):
        bit = template.get(param_no).value
        param_no += 1
        if str(bit) == "fi":
            # Bare language marker, nothing to derive from.
            continue
        lb_positions = [
            pos for pos, inner in bit._indexed_ifilter(recursive=False,
                                                       forcetype=Template)
            if is_l_fi(inner)
        ]
        if len(lb_positions) == 1:
            res = get_deriv_lb_template(ctx, bit, lb_positions[0])
            if res is not None:
                yield res
        elif not lb_positions:
            yield handle_deriv(ctx, bit, cats=cats)
        else:
            assert False, "too many {{l|fi ... }} templates"
def get_ety(wikicode: Wikicode):
    """Yield parsed etymology events ("exception" or derivation results)
    for an etymology section's wikicode."""
    # Multi-word-expression etymologies ("a + b") are not parsed structurally.
    if " + " in wikicode:
        yield "exception", mk_unknown_structure("mwe-ety")
    templates = wikicode.filter_templates()
    # NOTE(review): t_match supports both set operations (intersection/
    # difference) and .index()/subscripting below — presumably
    # template_matchers() returns an ordered-set-like type, not a plain set;
    # verify against its definition.
    t_match = template_matchers(templates)
    deriv_templates = t_match.intersection(ALL_DERIV_TEMPLATES)
    if len(deriv_templates) > 1:
        # Ambiguous: more than one derivation template in one etymology.
        yield "exception", mk_unknown_structure("multi-template-ety")
    elif len(deriv_templates) == 1:
        # Map the matched signature back to the concrete template node.
        yield from proc_ety_derivation_template(
            templates[t_match.index(deriv_templates[0])]
        )
    else:
        pass
    # Any non-derivation templates are reported individually.
    other_templates = t_match.difference(ALL_DERIV_TEMPLATES)
    for t_match in other_templates:
        yield "exception", mk_unknown_structure("unknown-template-ety",
                                                list(t_match))
def _default_filter_categories(self, wikicode: Wikicode) -> List[str]: """Return a list of categories from wikicode.""" wikilinks = wikicode.filter_wikilinks() ii = len("Category:") category_links = [ el for el in wikilinks if el.title.lower().startswith("category:") ] category_titles = [ el.title.rstrip()[ii:] for el in category_links if len(el.title.rstrip()[ii:]) > 0 ] category_titles = [ title[0].upper() + title[1:].replace(" ", "_") for title in category_titles ] return category_titles
def parse(content: Wikicode):
    """Extract a Citation from a reference's wikicode.

    Accepts either a Wikicode object or a raw string. Returns the Citation
    when it is locatable, otherwise False (also returned for any content
    this parser cannot fully interpret).
    """
    if type(content) is str:
        content = mwparserfromhell.parse(content)
    citation = Citation()
    for node in content.ifilter(recursive=False):
        if type(node) is ExternalLink:
            # Bare external link: URL plus optional display title.
            citation.url = str(node.url)
            if node.title:
                citation.title = str(node.title)
        elif type(node) is Template:
            tname = Utils.homogenizeTemplateName(str(node.name))
            if tname.startswith('Cite'):
                if tname in Parser.TEMPLATE_TYPE_MAPPING:
                    citation.type = Parser.TEMPLATE_TYPE_MAPPING[tname]
                for param in node.params:
                    pname = str(param.name)
                    if pname in Parser.TEMPLATE_FIELD_MAPPING:
                        citation[
                            Parser.TEMPLATE_FIELD_MAPPING[pname]] = str(
                                node.get(pname).value)
                    else:
                        # Do not touch citations with unparsable data
                        return False
            elif tname == 'Webarchive':
                if node.has('url'):
                    citation.archiveurl = str(node.get('url').value)
                if node.has('date'):
                    citation.archivedate = str(node.get('date').value)
                if node.has('title'):
                    citation.title = str(node.get('title').value)
        elif type(node) is Text:
            if str(node).strip():
                # Non-empty text node
                return False
        else:
            # Do not touch citations with unparsable data
            return False
    if citation.isLocatable():
        return citation
    return False
def get_deriv_lb_template(ctx: ParseContext, parent: Wikicode, tmpl_idx: int):
    """
    Build a derivation from the {{l|fi|...}} template at *tmpl_idx* inside
    *parent*; returns None when the template has no link target.
    """
    template = parent.get(tmpl_idx)
    if not template.has(2):
        return None
    link = template.get(2).value
    disp = str(template.get(3).value) if template.has(3) else None
    gloss = str(template.get("gloss").value) if template.has("gloss") else None
    cats = [str(template.get("pos").value)] if template.has("pos") else []
    # TODO: Categories outside template
    if not gloss:
        # Fall back to free text trailing the template.
        tail_wikicode = get_tail_text(parent, tmpl_idx)
        gloss = str(tail_wikicode) if tail_wikicode else None
    return handle_deriv(ctx, link, disp, gloss, cats)
def get_deriv(ctx: ParseContext, pos_spec: Wikicode) -> Iterator[Tuple[str, Any]]:
    """
    e.g.
    {{der-top}}
    * adjectives: {{l|fi|tuleva}} ...
    * {{l|fi|tulla voimaan}}: {{q|of a law}} to {{l|en|take effect}}
    {{der-bottom}}
    """
    for idx, tmpl in pos_spec._indexed_ifilter(recursive=False,
                                               forcetype=Template):
        if str(tmpl.name) in DER_CONTAINERS:
            yield from handle_der_container(ctx, tmpl)
        elif is_l_fi(tmpl):
            deriv = get_deriv_lb_template(ctx, pos_spec, idx)
            if deriv is not None:
                yield deriv
def parse_nested_list(wikicode: Wikicode) -> TextTreeList:
    """
    Parse '#'-marked wiki list items into a tree of TextTreeNodes.

    Tracks the stack of parent list levels so that deeper items become
    children of the most recent shallower item.

    :raises ValueError: if list nesting is malformed (no parent level left).
    """
    gen = peekable(title_gen(wikicode))
    root: TextTreeNode = TextTreeNode(None, [])
    path = [root]
    prev_level = -1
    parent_levels: List[int] = []

    def true_level():
        # Depth within the parent-level stack (root is -1).
        return len(parent_levels) - 1

    for i, tag in gen:
        level = get_level(wikicode, i)
        if level > prev_level:
            parent_levels.append(prev_level)
        while parent_levels[-1] >= level:
            parent_levels.pop()
            if len(parent_levels) == 0:
                print("parent_levels", parent_levels)
                # BUGFIX: was a bare `raise` with no active exception, which
                # produces an unhelpful RuntimeError; raise explicitly.
                raise ValueError("malformed list nesting: no parent level")
        path = path[:true_level() + 2]
        try:
            parent = path[true_level()]
        except Exception:
            # Debug output before re-raising (narrowed from bare except).
            print("level, prev_level", level, prev_level)
            print(path, true_level(), parent_levels)
            raise
        next_i, _ = gen.peek((None, None))
        # Item content spans from just past the '#' markers to the next item.
        new_node = TextTreeNode(Wikicode(wikicode.nodes[i + level + 1:next_i]),
                                [])
        parent.children.append(new_node)
        if level > prev_level:  # len(path) <= level + 1:
            path.append(new_node)
            # assert len(path) == level + 2
        else:
            path[-1] = new_node
        prev_level = level
    return root.children
def insert_template_after(tree, new, names):
    """Insert *new* (preceded by a newline) directly after the first
    template in *tree* matching *names*."""
    anchor = tlgetall(tree, names)[0]
    tree.insert_after(anchor, Wikicode([Text("\n"), new]))
def get_lead(wikicode: Wikicode):
    """Return the lead section (everything before the first heading)."""
    return wikicode.get_sections(levels=[], include_lead=True)[0]
def insert_template_before(tree, new, names):
    """Insert *new* (followed by a newline) directly before the first
    template in *tree* matching *names*."""
    anchor = tlgetall(tree, names)[0]
    tree.insert_before(anchor, Wikicode([new, Text("\n")]))
def block_templates(contents: Wikicode) -> List[Template]:
    """Return the templates in *contents* that are not inline templates."""
    result = []
    for tmpl in contents.filter_templates():
        if tmpl.name not in INLINE_TEMPLATES:
            result.append(tmpl)
    return result
def get_heading_node(wikicode: Wikicode):
    """Return the first heading node in *wikicode*, or None if absent."""
    for heading in wikicode.ifilter_headings():
        return heading
    return None
def is_defn(wc: Wikicode):
    """True iff *wc* starts with a level-3 or level-4 part-of-speech header.

    ``all`` here is the imported POS-code collection, not the builtin.
    """
    # TODO: support lvl5 headers
    return any(wc.startswith(marker + pos)
               for marker in ("===", "====")
               for pos in all)
def double_strip(wikicode: Wikicode) -> str:
    """Strip wiki markup twice and remove residual bold/italic quotes."""
    once = wikicode.strip_code()
    twice = parse(once).strip_code()
    # Order matters: remove bold ''' before italic ''.
    for quotes in ("'''", "''"):
        twice = twice.replace(quotes, "")
    return twice.strip()
from __future__ import unicode_literals try: from unittest2 import TestCase except ImportError: from unittest import TestCase from mwparserfromhell.compat import range from mwparserfromhell.nodes import (Argument, Comment, Heading, HTMLEntity, Tag, Template, Text, Wikilink) from mwparserfromhell.nodes.extras import Attribute, Parameter from mwparserfromhell.smart_list import SmartList from mwparserfromhell.wikicode import Wikicode wrap = lambda L: Wikicode(SmartList(L)) wraptext = lambda *args: wrap([Text(t) for t in args]) class TreeEqualityTestCase(TestCase): """A base test case with support for comparing the equality of node trees. This adds a number of type equality functions, for Wikicode, Text, Templates, and Wikilinks. """ def assertNodeEqual(self, expected, actual): """Assert that two Nodes have the same type and have the same data.""" registry = { Argument: self.assertArgumentNodeEqual, Comment: self.assertCommentNodeEqual, Heading: self.assertHeadingNodeEqual,
def _test_search(self, meth, expected):
    """Base test for insert_before(), insert_after(), and replace()."""
    # Basic replacement by string match, by non-recursive match, and by node.
    code = parse("{{a}}{{b}}{{c}}{{d}}{{e}}")
    func = partial(meth, code)
    func("{{b}}", "x", recursive=True)
    func("{{d}}", "[[y]]", recursive=False)
    func(code.get(2), "z")
    self.assertEqual(expected[0], code)
    # Missing targets (string or foreign node) must raise ValueError.
    self.assertRaises(ValueError, func, "{{r}}", "n", recursive=True)
    self.assertRaises(ValueError, func, "{{r}}", "n", recursive=False)
    fake = parse("{{a}}").get(0)
    self.assertRaises(ValueError, func, fake, "n", recursive=True)
    self.assertRaises(ValueError, func, fake, "n", recursive=False)
    # Duplicate templates: node matches pick the exact node, string matches
    # pick the first occurrence.
    code2 = parse("{{a}}{{a}}{{a}}{{b}}{{b}}{{b}}")
    func = partial(meth, code2)
    func(code2.get(1), "c", recursive=False)
    func("{{a}}", "d", recursive=False)
    func(code2.get(-1), "e", recursive=True)
    func("{{b}}", "f", recursive=True)
    self.assertEqual(expected[1], code2)
    # Nested targets require recursive=True.
    code3 = parse("{{a|{{b}}|{{c|d={{f}}}}}}")
    func = partial(meth, code3)
    obj = code3.get(0).params[0].value.get(0)
    self.assertRaises(ValueError, func, obj, "x", recursive=False)
    func(obj, "x", recursive=True)
    self.assertRaises(ValueError, func, "{{f}}", "y", recursive=False)
    func("{{f}}", "y", recursive=True)
    self.assertEqual(expected[2], code3)
    # Multi-node string and Wikicode spans as targets.
    code4 = parse("{{a}}{{b}}{{c}}{{d}}{{e}}{{f}}{{g}}{{h}}{{i}}{{j}}")
    func = partial(meth, code4)
    fake = parse("{{b}}{{c}}")
    self.assertRaises(ValueError, func, fake, "q", recursive=False)
    self.assertRaises(ValueError, func, fake, "q", recursive=True)
    func("{{b}}{{c}}", "w", recursive=False)
    func("{{d}}{{e}}", "x", recursive=True)
    func(Wikicode(code4.nodes[-2:]), "y", recursive=False)
    func(Wikicode(code4.nodes[-2:]), "z", recursive=True)
    self.assertEqual(expected[3], code4)
    self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=False)
    self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=True)
    # Multi-node spans nested inside template parameters.
    code5 = parse("{{a|{{b}}{{c}}|{{f|{{g}}={{h}}{{i}}}}}}")
    func = partial(meth, code5)
    self.assertRaises(ValueError, func, "{{b}}{{c}}", "x", recursive=False)
    func("{{b}}{{c}}", "x", recursive=True)
    obj = code5.get(0).params[1].value.get(0).params[0].value
    self.assertRaises(ValueError, func, obj, "y", recursive=False)
    func(obj, "y", recursive=True)
    self.assertEqual(expected[4], code5)
    # String matches crossing node boundaries / partial-node text.
    code6 = parse("here is {{some text and a {{template}}}}")
    func = partial(meth, code6)
    self.assertRaises(ValueError, func, "text and", "ab", recursive=False)
    func("text and", "ab", recursive=True)
    self.assertRaises(ValueError, func, "is {{some", "cd", recursive=False)
    func("is {{some", "cd", recursive=True)
    self.assertEqual(expected[5], code6)
    # Non-contiguous Wikicode targets raise; contiguous string form works.
    code7 = parse("{{foo}}{{bar}}{{baz}}{{foo}}{{baz}}")
    func = partial(meth, code7)
    obj = wrap([code7.get(0), code7.get(2)])
    self.assertRaises(ValueError, func, obj, "{{lol}}")
    func("{{foo}}{{baz}}", "{{lol}}")
    self.assertEqual(expected[6], code7)
    # Section objects as targets.
    code8 = parse("== header ==")
    func = partial(meth, code8)
    sec1, sec2 = code8.get_sections(include_headings=False)
    func(sec1, "lead\n")
    func(sec2, "\nbody")
    self.assertEqual(expected[7], code8)
    # Sections/whole-code used as both haystack and needle.
    code9 = parse("{{foo}}")
    meth(code9.get_sections()[0], code9.get_sections()[0], "{{bar}}")
    meth(code9.get_sections()[0], code9, "{{baz}}")
    meth(code9, code9, "{{qux}}")
    meth(code9, code9.get_sections()[0], "{{quz}}")
    self.assertEqual(expected[8], code9)
def insert_template_at_start(tree, new):
    """Prepend *new* followed by a newline to *tree*."""
    prefix = Wikicode([new, Text("\n")])
    tree.insert(0, prefix)
def wikilinks_counts(wikicode: Wikicode) -> Counter:
    """Count unique wikilinks in a Wikicode object."""
    tally = Counter()
    for link in wikicode.ifilter_wikilinks():
        tally[str(link.title).strip()] += 1
    return tally
def template_counts(wikicode: Wikicode) -> Counter:
    """Count unique templates in a wikicode object"""
    names = [str(tpl.name).strip() for tpl in wikicode.ifilter_templates()]
    return Counter(names)