Beispiel #1
0
def combine_template_chains(wc: Wikicode, new_template_name: str,
                            template_indices: List[int],
                            text_indices: List[int]) -> None:
    """
    Fold runs of templates that are joined by free text into one wrapper
    template named ``new_template_name``.

    ``template_indices`` are positions in ``wc.nodes``. A template joins the
    current run when the position right after it is listed in
    ``text_indices``, or when it sits two positions after a member of the run
    and the previous template examined was itself followed by such a text
    position. Runs with fewer than two members are dropped.

    ``wc`` is modified in place: each run's templates become the positional
    parameters of a new template inserted before the run's first member, and
    the original templates are then removed.
    """
    chains = []
    current = []
    prev_followed_by_text = False

    for pos in template_indices:
        followed_by_text = pos + 1 in text_indices
        if followed_by_text or (pos - 2 in current
                                and prev_followed_by_text):
            current.append(pos)

        prev_followed_by_text = followed_by_text
        if followed_by_text:
            continue
        # The run ends here: keep it only if it links two or more templates.
        if len(current) > 1:
            chains.append(current)
        current = []

    if len(current) > 1:
        chains.append(current)

    # Resolve positions to node objects before any mutation shifts them.
    node_chains = [[wc.nodes[pos] for pos in chain] for chain in chains]

    for members in node_chains:
        wrapper = Template(new_template_name,
                           params=[
                               Parameter(str(number), member, showkey=False)
                               for number, member in enumerate(members,
                                                               start=1)
                           ])
        wc.insert_before(members[0], wrapper, recursive=False)
        for member in members:
            wc.remove(member, recursive=False)
Beispiel #2
0
    def __init__(self,
                 wikicode: Wikicode,
                 subordinates: List[Wikicode],
                 header_type: str = None,
                 idx: int = None,
                 metainfo=None,
                 lvl: int = 3):
        """
        Store a section whose text starts with a heading of exactly ``lvl``
        equals signs.

        When ``header_type`` is omitted it is read from the heading text
        (``"===Pronunciation==="`` -> ``"Pronunciation"``). In that case a
        heading of the form ``"Etymology <n>"`` is split into the type
        ``"Etymology"`` and ``idx = n`` (unless ``idx`` was given), and a
        heading listed in ``poscodes.all`` gets ``metainfo = "Definition"``
        (unless ``metainfo`` was given). ``metainfo`` otherwise falls back to
        the header type.
        """
        marker = "=" * lvl
        # The heading must open with exactly `lvl` equals signs, no more.
        assert wikicode.startswith(marker) and not wikicode.startswith(
            "=" * (lvl + 1))

        if header_type is None:
            # "===Pronunciation===" -> "Pronunciation===" -> "Pronunciation"
            after_marker: str = wikicode[lvl:]
            header_type = after_marker[:after_marker.index(marker)]

            if idx is None and header_type.startswith("Etymology "):
                assert len(header_type) >= 11  # len("Etymology 1")
                idx = int(header_type[10:])
                assert idx and type(idx) == int
                header_type = "Etymology"

            is_pos_header = header_type in poscodes.all
            if metainfo is None and is_pos_header:
                metainfo = "Definition"

        if metainfo is None:
            metainfo = header_type  # TODO: make it into an enum

        self.wikicode = wikicode
        self.subordinates = subordinates
        self.idx = idx
        self.header_type = header_type
        self.metainfo = metainfo
        self.lvl = lvl
Beispiel #3
0
def from_citeweb(wikicode: Wikicode) -> Wikicode:
    """Replace simple ``{{Cite web}}`` templates with plain-wikitext citations.

    A template is converted only when it has neither a ``format`` nor an
    ``author`` parameter and provides a title, a URL and an access date
    (``accessdate``, falling back to ``access-date``). Website, publisher and
    date are included in the output when present.

    The wikicode is modified in place and also returned; the number of
    replacements is reported through ``pywikibot.output``.
    """
    count = 0
    for t in wikicode.filter_templates():
        # `matches` normalises first-letter case and surrounding whitespace,
        # so templates written over several lines are recognised as well.
        if not t.name.matches('Cite web'):
            continue
        if t.has('format') or t.has('author'):
            continue

        title = getvalue(t, 'title')
        url = getvalue(t, 'url')
        date = format_date(getvalue(t, 'date'))
        website = getvalue(t, 'website')
        publisher = getvalue(t, 'publisher')
        accessdate = format_date(
            getvalue(t, 'accessdate') or getvalue(t, 'access-date'))

        # all() takes a single iterable, not separate arguments.
        if not all((title, url, accessdate)):
            continue

        if publisher and website:
            source = f". ''{website}''. {publisher}"
        elif publisher:
            source = f'. {publisher}'
        elif website:
            source = f". ''{website}''"
        else:
            source = ''

        # Every variant ends the same way: optional "(date)" then ". ".
        ending = f' ({date}). ' if date else '. '
        new = f'“[{url} {title}]”{source}{ending}{accessdate}閲覧。'

        wikicode.replace(t, new)
        count += 1

    pywikibot.output(f'{{{{Cite web}}}} を {count} 回置換しました')
    return wikicode
Beispiel #4
0
def get_tail_text(root: Wikicode, lb_idx: int) -> Optional[Wikicode]:
    """
    Collect the text that trails an {{l|fi ...}} link after a colon, so it
    can be used as the link's description.

    Returns None when the node right after ``lb_idx`` does not exist or is
    not a Text node starting with ":". Otherwise the colon is stripped from
    that node (the node is modified in place) and it is returned together
    with every following node up to, but excluding, the next {{l|fi ...}}
    template or the end of ``root``.
    """
    pos = lb_idx + 1
    try:
        first = root.get(pos)
    except IndexError:
        return None
    if not (isinstance(first, Text) and first.value.startswith(":")):
        return None

    first.value = first.value[1:].strip()
    tail = [first]
    while True:
        pos += 1
        try:
            following = root.get(pos)
        except IndexError:
            break
        if is_l_fi(following):
            break
        tail.append(following)
    return Wikicode(tail)
Beispiel #5
0
 def repl_conditional(self, arg: Template, code: Wikicode,
                      index: Union[str, int]):
     """
     Replace template ``arg`` inside ``code`` with its parameter ``index``,
     or remove ``arg`` altogether when it has no such parameter.

     The parameter value is first passed through ``self.apply_wikitext``.
     Values of keyed parameters (``showkey`` set) are inserted as stripped
     strings; other values are inserted as they are.
     """
     if not arg.has(index):
         code.remove(arg)
         return

     param = arg.get(index)
     self.apply_wikitext(param.value)
     if param.showkey:
         replacement = str(param.value).strip()
     else:
         replacement = param.value
     code.replace(arg, replacement)
Beispiel #6
0
def _first_word(text) -> str:
    """Return the part of ``text`` (stripped) before its first comma or space."""
    # NOTE: the pattern must not contain an empty alternative. Since Python
    # 3.7 re.split() also splits on empty matches, which would make the first
    # piece always "".
    return re.split(r"[, ]", text.strip(), maxsplit=1)[0]


def merge_etyl_templates(wc: Wikicode) -> Wikicode:
    """
    Given a chunk of wikicode, finds instances where the deprecated `etyl` template is immediately followed by
    either a word in free text, a linked word, or a generic `mention`/`link`/`langname-mention` template.
    It replaces this pattern with a new `derived-parsed` template -- meaning the same thing as the `derived` template
    but namespaced to differentiate. For cases where the `mention` language is different from the `etyl` language,
    we use the former. The template is removed if we can't parse it effectively.

    ``wc`` is modified in place and returned. An `etyl` template that is the
    last node of ``wc`` is left untouched, because nothing follows it.
    """
    etyl_indices = [
        i for i, node in enumerate(wc.nodes) if isinstance(node, Template)
        and node.name == "etyl" and i < len(wc.nodes) - 1
    ]

    nodes_to_remove = []
    for i in etyl_indices:
        etyl: Template = wc.nodes[i]
        if not etyl.params:
            # No source language at all: nothing we can parse.
            nodes_to_remove.append(etyl)
            continue

        related_language = etyl.params[0]
        language = etyl.params[1] if len(etyl.params) > 1 else "en"

        make_new_template = False
        val = None
        follower = wc.nodes[i + 1]
        if isinstance(follower, Text):
            val = _first_word(follower.value)
            make_new_template = bool(val)
        elif isinstance(follower, Wikilink):
            val = _first_word(follower.text or follower.title)
            make_new_template = bool(val)
        elif isinstance(follower, Template):
            is_mention = follower.name in ("m", "mention", "m+",
                                           "langname-mention", "l", "link")
            # Needs both a language and a term to be usable.
            if is_mention and len(follower.params) > 1:
                related_language = follower.params[0]
                val = follower.params[1].value
                make_new_template = True
                # The mention is absorbed into the new template.
                nodes_to_remove.append(follower)

        if make_new_template:
            params = [
                Parameter(str(number), str(param), showkey=False)
                for number, param in enumerate(
                    [language, related_language, val], start=1)
            ]
            new_template = Template("derived-parsed", params=params)
            # One-for-one replacement keeps the precomputed indices valid.
            wc.replace(etyl, new_template, recursive=False)
        else:
            nodes_to_remove.append(etyl)

    # Removal is deferred so the indices collected above stay valid.
    for node in nodes_to_remove:
        wc.remove(node, recursive=False)
    return wc
Beispiel #7
0
def flag_template(self: TemplateParser,
                  code: Wikicode,
                  template: Template,
                  flag,
                  index=None):
    """
    Record ``flag`` in ``self.state.flags`` and take ``template`` out of
    ``code``.

    When ``index`` is given and the template has that parameter, the
    template is replaced by the parameter's value (after passing the value
    through ``self.apply_wikitext``); otherwise the template is removed.
    """
    if index and template.has(index):
        param = template.get(index)
        self.apply_wikitext(param.value)
        # Insert the parameter's value, not the Parameter object itself:
        # a keyed Parameter renders as "name=value", which would leak the
        # key into the surrounding text.
        code.replace(
            template,
            str(param.value).strip() if param.showkey else param.value)
    else:
        code.remove(template)
    self.state.flags.add(flag)
Beispiel #8
0
def clean_wikicode(wc: Wikicode):
    """
    Tidy an etymology section in place.

    First drops every top-level node that is neither Text, Wikilink nor
    Template, as well as Text nodes that contain only whitespace. Then runs
    the template-merging passes and finally strips links.
    """

    def is_extraneous(node):
        if isinstance(node, Text):
            # Text survives only when it has non-whitespace content.
            return not bool(node.value.strip())
        return not isinstance(node, (Wikilink, Template))

    for unwanted in wc.filter(recursive=False, matches=is_extraneous):
        wc.remove(unwanted)

    merge_etyl_templates(wc)
    get_plus_combos(wc)
    get_comma_combos(wc)
    get_from_chains(wc)
    remove_links(wc)
Beispiel #9
0
def is_l_fi(template: Wikicode):
    """Return True when ``template`` is an ``{{l|fi|...}}`` template.

    Anything that is not a Template named exactly "l" gives False; for an
    "l" template the first positional parameter must equal "fi".
    """
    if not isinstance(template, Template) or template.name != "l":
        return False
    return str(template.get(1).value) == "fi"
Beispiel #10
0
def title_gen(wikicode: Wikicode) -> Iterator[Tuple[int, Tag]]:
    """Yield ``(index, tag)`` for each Tag equal to "#" found in ``wikicode``.

    Matches reported at the same index as the previously yielded one are
    skipped, so each index is produced at most once in a row.
    """
    last_index = None
    for index, tag in wikicode._indexed_ifilter(matches="#", forcetype=Tag):
        if tag != "#":
            continue
        if last_index == index:
            continue
        yield index, tag
        last_index = index
Beispiel #11
0
def handle_template(tpl, code, namespace=None):
    """Normalise the parameters of a course template and return an edit summary.

    * ``sprache``: language names are replaced by the codes ``en`` / ``de``.
    * ``wann``: semester names are replaced by ``SS`` / ``WS`` / ``beide``.
    * ``tiss``: the placeholder value ``1234567890`` is removed.
    * ``veraltet`` / ``nachfolger`` parameters and ``{{Veraltet}}`` templates
      in ``code`` are removed; unless ``code`` already contains an
      ``{{Archiv}}`` template, one is inserted at the top of ``code``
      (carrying ``nachfolger`` when a successor was given).
    * ``zuordnungen``: en dashes in parameter ``2`` of each contained
      template become hyphens, and the templates are sorted by parameter
      ``1`` and re-laid out one per line.

    ``tpl`` and ``code`` are modified in place. ``namespace`` is accepted
    but not used here.
    """
    language_codes = {
        'englisch': 'en',
        'english': 'en',
        'deutsch': 'de',
        'german': 'de',
    }
    semester_codes = {
        'Sommersemester': 'SS',
        'ss': 'SS',
        'Wintersemester': 'WS',
        'ws': 'WS',
        'Winter- und Sommersemester': 'beide',
        'Sommer- und Wintersemester': 'beide',
    }

    if tpl.has('sprache'):
        language = tpl.get('sprache').value.strip().lower()
        if language in language_codes:
            set_param_value(tpl, 'sprache', language_codes[language])

    if tpl.has('wann'):
        semester = tpl.get('wann').value.strip()
        if semester in semester_codes:
            set_param_value(tpl, 'wann', semester_codes[semester])

    if tpl.has('tiss') and tpl.get('tiss').value.strip() == '1234567890':
        tpl.remove('tiss')

    archived = False
    successor = None
    if tpl.has('veraltet'):
        archived = True
        tpl.remove('veraltet')
    if tpl.has('nachfolger'):
        archived = True
        successor = tpl.get('nachfolger').value.strip()
        tpl.remove('nachfolger')

    # filter_templates() returns a list, so removing nodes from `code`
    # cannot disturb the iteration the way a lazy ifilter could.
    for stale in code.filter_templates(
            matches=lambda t: t.name.matches('Veraltet')):
        archived = True
        code.remove(stale)

    already_flagged = code.filter_templates(
        matches=lambda t: t.name.matches('Archiv'))
    if archived and not already_flagged:
        # Use a separate name: `tpl` must keep pointing at the template
        # being normalised, because it is still needed below.
        archive_tpl = Template(Wikicode([Text('Archiv')]))
        if successor:
            archive_tpl.add('nachfolger', successor)
        code.insert(0, archive_tpl)
        code.insert(1, '\n\n')

    if tpl.has('zuordnungen'):
        rels = tpl.get('zuordnungen').value.filter_templates()
        for rel in rels:
            if rel.has('2'):
                rel.get('2').value = str(rel.get('2').value).replace('–', '-')
        rels.sort(key=lambda x: x.get('1'))
        tpl.get('zuordnungen').value = '\n' + '\n'.join(
            [' ' * 4 + str(r) for r in rels]) + '\n'

    return 'fixe LVA-Daten'
Beispiel #12
0
def handle_der_container(ctx: ParseContext,
                         template: Wikicode) -> Iterator[Tuple[str, Any]]:
    """
    Yield derivation results for each entry of a derived-terms container
    template.

    e.g.
    {{der2|fi|title=phrasal verbs
    |{{l|fi|[[pitää ääntä|''pitää'' ääntä]] + ''elative''|t=to [[make]] (a) [[noise]] about something {{q|e.g. about an egregious problem}}}}
    or e.g.
    {{der3|fi|title=nouns
    |pidike
    |pidin

    Positional parameters are read from 1 upwards until one is missing; a
    parameter that is exactly "fi" is skipped. An entry without a top-level
    {{l|fi ...}} template is handled as plain text (with the container's
    ``title`` as category), an entry with exactly one goes through
    ``get_deriv_lb_template``, and more than one trips an assertion.
    """
    cats = []
    if template.has("title"):
        cats.append(str(template.get("title").value).strip())

    position = 1
    while template.has(position):
        bit = template.get(position).value
        position += 1
        if str(bit) == "fi":
            continue

        link_positions = [
            node_idx
            for node_idx, node in bit._indexed_ifilter(recursive=False,
                                                       forcetype=Template)
            if is_l_fi(node)
        ]
        link_count = len(link_positions)
        if link_count == 0:
            yield handle_deriv(ctx, bit, cats=cats)
        elif link_count == 1:
            res = get_deriv_lb_template(ctx, bit, link_positions[0])
            if res is not None:
                yield res
        else:
            assert False, "too many {{l|fi ... }} templates"
Beispiel #13
0
def get_ety(wikicode: Wikicode):
    """Yield etymology items, or ``("exception", ...)`` pairs, for ``wikicode``.

    * Text containing " + " yields an "mwe-ety" exception (processing then
      continues).
    * Exactly one template matching ``ALL_DERIV_TEMPLATES`` is delegated to
      ``proc_ety_derivation_template``; more than one yields a
      "multi-template-ety" exception.
    * Every template match outside ``ALL_DERIV_TEMPLATES`` yields an
      "unknown-template-ety" exception.
    """
    if " + " in wikicode:
        yield "exception", mk_unknown_structure("mwe-ety")

    templates = wikicode.filter_templates()
    t_match = template_matchers(templates)

    deriv_templates = t_match.intersection(ALL_DERIV_TEMPLATES)
    deriv_count = len(deriv_templates)
    if deriv_count == 1:
        deriv_pos = t_match.index(deriv_templates[0])
        yield from proc_ety_derivation_template(templates[deriv_pos])
    elif deriv_count > 1:
        yield "exception", mk_unknown_structure("multi-template-ety")

    for unknown in t_match.difference(ALL_DERIV_TEMPLATES):
        yield "exception", mk_unknown_structure("unknown-template-ety",
                                                list(unknown))
Beispiel #14
0
    def _default_filter_categories(self, wikicode: Wikicode) -> List[str]:
        """Return the category names linked from ``wikicode``.

        Only wikilinks whose title starts with "category:" (any case) are
        considered. The prefix is cut off, links with nothing after the
        prefix are skipped, the first character is upper-cased and spaces in
        the remainder are replaced by underscores.
        """
        prefix_len = len("Category:")
        names = []
        for link in wikicode.filter_wikilinks():
            if not link.title.lower().startswith("category:"):
                continue
            name = link.title.rstrip()[prefix_len:]
            if len(name) > 0:
                names.append(name[0].upper() + name[1:].replace(" ", "_"))
        return names
Beispiel #15
0
    def parse(content: Wikicode):
        """Build a Citation from the top-level nodes of ``content``.

        ``content`` may also be a plain string, which is parsed first.
        Returns the Citation when ``isLocatable()`` is true for it, and
        False otherwise. False is also returned as soon as something is
        found that cannot be interpreted: non-blank text, a node that is
        not an external link, template or text, or a ``Cite...`` template
        with a parameter missing from ``Parser.TEMPLATE_FIELD_MAPPING``.
        Templates other than ``Cite...`` and ``Webarchive`` are ignored.
        """
        if type(content) is str:
            content = mwparserfromhell.parse(content)

        result = Citation()
        for node in content.ifilter(recursive=False):
            kind = type(node)

            if kind is ExternalLink:
                result.url = str(node.url)
                if node.title:
                    result.title = str(node.title)
                continue

            if kind is Text:
                if str(node).strip():
                    # Non-empty text node
                    return False
                continue

            if kind is not Template:
                # Do not touch citations with unparsable data
                return False

            tname = Utils.homogenizeTemplateName(str(node.name))
            if tname.startswith('Cite'):
                if tname in Parser.TEMPLATE_TYPE_MAPPING:
                    result.type = Parser.TEMPLATE_TYPE_MAPPING[tname]

                for param in node.params:
                    pname = str(param.name)
                    if pname not in Parser.TEMPLATE_FIELD_MAPPING:
                        # Do not touch citations with unparsable data
                        return False
                    field = Parser.TEMPLATE_FIELD_MAPPING[pname]
                    result[field] = str(node.get(pname).value)
            elif tname == 'Webarchive':
                for source_key, target_attr in (('url', 'archiveurl'),
                                                ('date', 'archivedate'),
                                                ('title', 'title')):
                    if node.has(source_key):
                        setattr(result, target_attr,
                                str(node.get(source_key).value))

        return result if result.isLocatable() else False
Beispiel #16
0
def get_deriv_lb_template(ctx: ParseContext, parent: Wikicode, tmpl_idx: int):
    """Turn the link template at ``parent[tmpl_idx]`` into a derivation.

    Returns None when the template has no second positional parameter.
    Otherwise parameter 2 is the link, parameter 3 the display text,
    ``gloss`` the gloss and ``pos`` a category. When no gloss is given, the
    text trailing the template inside ``parent`` (see ``get_tail_text``) is
    used instead. The result comes from ``handle_deriv``.
    """
    template = parent.get(tmpl_idx)
    if not template.has(2):
        return None

    link = template.get(2).value
    disp = str(template.get(3).value) if template.has(3) else None
    gloss = str(template.get("gloss").value) if template.has("gloss") else None
    cats = [str(template.get("pos").value)] if template.has("pos") else []
    # TODO: Categories outside template
    if not gloss:
        tail = get_tail_text(parent, tmpl_idx)
        gloss = str(tail) if tail else None
    return handle_deriv(ctx, link, disp, gloss, cats)
Beispiel #17
0
def get_deriv(ctx: ParseContext,
              pos_spec: Wikicode) -> Iterator[Tuple[str, Any]]:
    """
    Yield derivations found among the top-level templates of ``pos_spec``.

    e.g.
    {{der-top}}
    * adjectives: {{l|fi|tuleva}}
    ...
    * {{l|fi|tulla voimaan}}: {{q|of a law}} to {{l|en|take effect}}
    {{der-bottom}}

    Templates named in ``DER_CONTAINERS`` are expanded through
    ``handle_der_container``; {{l|fi ...}} templates go through
    ``get_deriv_lb_template`` and are skipped when that returns None. All
    other templates are ignored.
    """
    for idx, template in pos_spec._indexed_ifilter(recursive=False,
                                                   forcetype=Template):
        if str(template.name) in DER_CONTAINERS:
            yield from handle_der_container(ctx, template)
            continue
        if not is_l_fi(template):
            continue
        deriv = get_deriv_lb_template(ctx, pos_spec, idx)
        if deriv is not None:
            yield deriv
Beispiel #18
0
def parse_nested_list(wikicode: Wikicode) -> TextTreeList:
    """Build a tree of ``TextTreeNode`` objects from a nested "#" list.

    Each item reported by ``title_gen`` becomes a node whose content is the
    slice of ``wikicode.nodes`` between that item's marker (skipping
    ``level + 1`` nodes) and the next item. Nesting follows the level from
    ``get_level``: an item with a higher level than its predecessor becomes
    its child. Returns the children of the synthetic root.

    Raises:
        RuntimeError: if the nesting bookkeeping runs out of parent levels.
        IndexError: if no parent node exists for an item (diagnostics are
            printed before the error propagates).
    """
    gen = peekable(title_gen(wikicode))
    root: TextTreeNode = TextTreeNode(None, [])
    # path[k] is the most recent node at nesting depth k (path[0] is root).
    path = [root]
    prev_level = -1
    # Raw levels of the current chain of ancestors; its length gives the
    # effective depth even when raw levels skip values.
    parent_levels: List[int] = []

    def true_level():
        return len(parent_levels) - 1

    for i, _tag in gen:
        level = get_level(wikicode, i)
        if level > prev_level:
            parent_levels.append(prev_level)
        # Climb back up until the innermost ancestor is shallower than us.
        while parent_levels[-1] >= level:
            parent_levels.pop()
            if len(parent_levels) == 0:
                # An explicit error: a bare `raise` has no active exception
                # here and would only surface as an opaque RuntimeError.
                raise RuntimeError(
                    "parse_nested_list: ran out of parent levels "
                    f"(level={level}, prev_level={prev_level})")
            path = path[:true_level() + 2]
        try:
            parent = path[true_level()]
        except IndexError:
            print("level, prev_level", level, prev_level)
            print(path, true_level(), parent_levels)
            raise
        # The item's content runs up to the next item (or the end: None).
        next_i, _ = gen.peek((None, None))
        new_node = TextTreeNode(Wikicode(wikicode.nodes[i + level + 1:next_i]),
                                [])
        parent.children.append(new_node)
        if level > prev_level:
            path.append(new_node)
        else:
            path[-1] = new_node
        prev_level = level
    return root.children
Beispiel #19
0
def insert_template_after(tree, new, names):
    """Insert a newline followed by ``new`` after the first item that
    ``tlgetall(tree, names)`` returns."""
    anchor = tlgetall(tree, names)[0]
    payload = Wikicode([Text("\n"), new])
    tree.insert_after(anchor, payload)
Beispiel #20
0
def get_lead(wikicode: Wikicode):
    """Return the first section that ``get_sections`` gives when asked for
    no heading levels but with the lead included."""
    return wikicode.get_sections(levels=[], include_lead=True)[0]
Beispiel #21
0
def insert_template_before(tree, new, names):
    """Insert ``new`` followed by a newline before the first item that
    ``tlgetall(tree, names)`` returns."""
    anchor = tlgetall(tree, names)[0]
    payload = Wikicode([new, Text("\n")])
    tree.insert_before(anchor, payload)
Beispiel #22
0
def block_templates(contents: Wikicode) -> List[Template]:
    """Return the templates in ``contents`` whose name is not listed in
    ``INLINE_TEMPLATES``."""
    kept = []
    for template in contents.filter_templates():
        if template.name in INLINE_TEMPLATES:
            continue
        kept.append(template)
    return kept
Beispiel #23
0
def get_heading_node(wikicode: Wikicode):
    """Return the first heading found in ``wikicode``, or None if there is
    none."""
    for heading in wikicode.ifilter_headings():
        return heading
    return None
Beispiel #24
0
def is_defn(wc: Wikicode):
    """Return True when ``wc`` starts with "===" or "====" immediately
    followed by one of the entries of the module-level ``all``."""
    # TODO: support lvl5 headers
    for marker in ("===", "===="):
        for x in all:
            if wc.startswith(marker + x):
                return True
    return False
Beispiel #25
0
def double_strip(wikicode: Wikicode) -> str:
    """Return the plain text of ``wikicode``.

    Markup is stripped, the result is parsed and stripped a second time,
    then any remaining bold (''') and italic ('') quote markers are removed
    and surrounding whitespace is trimmed.
    """
    text = parse(wikicode.strip_code()).strip_code()
    for quote_markup in ("'''", "''"):
        text = text.replace(quote_markup, "")
    return text.strip()
from __future__ import unicode_literals

try:
    from unittest2 import TestCase
except ImportError:
    from unittest import TestCase

from mwparserfromhell.compat import range
from mwparserfromhell.nodes import (Argument, Comment, Heading, HTMLEntity,
                                    Tag, Template, Text, Wikilink)
from mwparserfromhell.nodes.extras import Attribute, Parameter
from mwparserfromhell.smart_list import SmartList
from mwparserfromhell.wikicode import Wikicode

# Test helper: build a Wikicode object from a list of nodes, wrapping the
# list in a SmartList first.
wrap = lambda L: Wikicode(SmartList(L))
# Test helper: build a Wikicode made of one Text node per string argument.
wraptext = lambda *args: wrap([Text(t) for t in args])

class TreeEqualityTestCase(TestCase):
    """A base test case with support for comparing the equality of node trees.

    This adds a number of type equality functions, for Wikicode, Text,
    Templates, and Wikilinks.
    """

    def assertNodeEqual(self, expected, actual):
        """Assert that two Nodes have the same type and have the same data."""
        registry = {
            Argument: self.assertArgumentNodeEqual,
            Comment: self.assertCommentNodeEqual,
            Heading: self.assertHeadingNodeEqual,
    def _test_search(self, meth, expected):
        """Base test for insert_before(), insert_after(), and replace().

        ``meth`` is called with the Wikicode to operate on as its first
        argument, followed by the search target and the new value.
        ``expected`` is indexed from 0 to 8: one expected result per
        scenario below, compared against the modified code object.
        """
        # Scenario 0: flat list of templates; targets given as strings and
        # as a node taken from the code itself.
        code = parse("{{a}}{{b}}{{c}}{{d}}{{e}}")
        func = partial(meth, code)
        func("{{b}}", "x", recursive=True)
        func("{{d}}", "[[y]]", recursive=False)
        func(code.get(2), "z")
        self.assertEqual(expected[0], code)
        # A string that is not present must raise ValueError ...
        self.assertRaises(ValueError, func, "{{r}}", "n", recursive=True)
        self.assertRaises(ValueError, func, "{{r}}", "n", recursive=False)
        # ... and so must a node that comes from a separate parse() call.
        fake = parse("{{a}}").get(0)
        self.assertRaises(ValueError, func, fake, "n", recursive=True)
        self.assertRaises(ValueError, func, fake, "n", recursive=False)

        # Scenario 1: the same template text occurs several times.
        code2 = parse("{{a}}{{a}}{{a}}{{b}}{{b}}{{b}}")
        func = partial(meth, code2)
        func(code2.get(1), "c", recursive=False)
        func("{{a}}", "d", recursive=False)
        func(code2.get(-1), "e", recursive=True)
        func("{{b}}", "f", recursive=True)
        self.assertEqual(expected[1], code2)

        # Scenario 2: targets nested inside template parameters; these are
        # expected to raise ValueError unless recursive=True.
        code3 = parse("{{a|{{b}}|{{c|d={{f}}}}}}")
        func = partial(meth, code3)
        obj = code3.get(0).params[0].value.get(0)
        self.assertRaises(ValueError, func, obj, "x", recursive=False)
        func(obj, "x", recursive=True)
        self.assertRaises(ValueError, func, "{{f}}", "y", recursive=False)
        func("{{f}}", "y", recursive=True)
        self.assertEqual(expected[2], code3)

        # Scenario 3: targets spanning two adjacent nodes, given as strings
        # and as Wikicode objects built from the code's own nodes. A
        # separately parsed Wikicode is expected to raise ValueError.
        code4 = parse("{{a}}{{b}}{{c}}{{d}}{{e}}{{f}}{{g}}{{h}}{{i}}{{j}}")
        func = partial(meth, code4)
        fake = parse("{{b}}{{c}}")
        self.assertRaises(ValueError, func, fake, "q", recursive=False)
        self.assertRaises(ValueError, func, fake, "q", recursive=True)
        func("{{b}}{{c}}", "w", recursive=False)
        func("{{d}}{{e}}", "x", recursive=True)
        func(Wikicode(code4.nodes[-2:]), "y", recursive=False)
        func(Wikicode(code4.nodes[-2:]), "z", recursive=True)
        self.assertEqual(expected[3], code4)
        # After the edits above, "{{c}}{{d}}" is expected to raise.
        self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=False)
        self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=True)

        # Scenario 4: two-node targets nested inside template parameters.
        code5 = parse("{{a|{{b}}{{c}}|{{f|{{g}}={{h}}{{i}}}}}}")
        func = partial(meth, code5)
        self.assertRaises(ValueError, func, "{{b}}{{c}}", "x", recursive=False)
        func("{{b}}{{c}}", "x", recursive=True)
        obj = code5.get(0).params[1].value.get(0).params[0].value
        self.assertRaises(ValueError, func, obj, "y", recursive=False)
        func(obj, "y", recursive=True)
        self.assertEqual(expected[4], code5)

        # Scenario 5: string targets that are fragments of text rather than
        # whole nodes; only the recursive calls are expected to succeed.
        code6 = parse("here is {{some text and a {{template}}}}")
        func = partial(meth, code6)
        self.assertRaises(ValueError, func, "text and", "ab", recursive=False)
        func("text and", "ab", recursive=True)
        self.assertRaises(ValueError, func, "is {{some", "cd", recursive=False)
        func("is {{some", "cd", recursive=True)
        self.assertEqual(expected[5], code6)

        # Scenario 6: a Wikicode wrapping two nodes that are not adjacent in
        # the code is expected to raise; the equivalent string is accepted.
        code7 = parse("{{foo}}{{bar}}{{baz}}{{foo}}{{baz}}")
        func = partial(meth, code7)
        obj = wrap([code7.get(0), code7.get(2)])
        self.assertRaises(ValueError, func, obj, "{{lol}}")
        func("{{foo}}{{baz}}", "{{lol}}")
        self.assertEqual(expected[6], code7)

        # Scenario 7: targets are the two sections returned by
        # get_sections(include_headings=False).
        code8 = parse("== header ==")
        func = partial(meth, code8)
        sec1, sec2 = code8.get_sections(include_headings=False)
        func(sec1, "lead\n")
        func(sec2, "\nbody")
        self.assertEqual(expected[7], code8)

        # Scenario 8: every combination of the code itself and its first
        # section, used as the object operated on and as the target.
        code9 = parse("{{foo}}")
        meth(code9.get_sections()[0], code9.get_sections()[0], "{{bar}}")
        meth(code9.get_sections()[0], code9, "{{baz}}")
        meth(code9, code9, "{{qux}}")
        meth(code9, code9.get_sections()[0], "{{quz}}")
        self.assertEqual(expected[8], code9)
Beispiel #28
0
def insert_template_at_start(tree, new):
    """Insert ``new`` followed by a newline at position 0 of ``tree``."""
    leading = Wikicode([new, Text("\n")])
    tree.insert(0, leading)
Beispiel #29
0
def wikilinks_counts(wikicode: Wikicode) -> Counter:
    """Count how often each wikilink title (whitespace-stripped) occurs in
    a Wikicode object."""
    tally = Counter()
    for link in wikicode.ifilter_wikilinks():
        tally[str(link.title).strip()] += 1
    return tally
Beispiel #30
0
def template_counts(wikicode: Wikicode) -> Counter:
    """Count how often each template name (whitespace-stripped) occurs in
    a Wikicode object."""
    tally = Counter()
    for template in wikicode.ifilter_templates():
        tally[str(template.name).strip()] += 1
    return tally