def read_wiki_data(input_filename):
    """Read a wikitext file and collect its known article sections.

    The file is decoded as UTF-8, parsed with ``wikitextparser`` and every
    section whose title exactly matches one of the recognised headings is
    stored under a fixed key.

    Args:
        input_filename: Path of the wikitext file to read.

    Returns:
        dict: Maps the keys ``intro_paragraph``, ``loser_paragraphs``,
        ``winner_paragraphs``, ``running_paragraphs``, ``biggie_paragraphs``,
        ``gem_paragraphs`` and ``conclusion_paragraph`` to the matching
        section's ``string``.  Keys whose heading is absent from the file are
        omitted.  If a heading occurs more than once, the last occurrence
        wins.
    """
    # Titles are compared verbatim, including the single space on each side
    # of the heading text.
    title_to_key = {
        " Intro paragraph ": "intro_paragraph",
        " Losers ": "loser_paragraphs",
        " Winners ": "winner_paragraphs",
        " Still in the Running ": "running_paragraphs",
        " Biggies ": "biggie_paragraphs",
        " Hidden Gems ": "gem_paragraphs",
        " Conclusion ": "conclusion_paragraph",
    }

    # The context manager closes the file even when reading raises.
    with open(input_filename, "r", encoding="utf-8") as fp:
        input_text = fp.read()

    # Parse once; the result of the parse is the only thing used below.
    parsed_text = wikitextparser.WikiText(input_text)

    article_sections = {}
    for section in parsed_text.sections:
        key = title_to_key.get(section.title)
        if key is not None:
            article_sections[key] = section.string

    return article_sections
Exemple #2
0
 def test_template_name_cannot_be_empty(self):
     """Braces whose name part is only ``_`` must not yield a Template."""
     bare = wtp.WikiText('{{_}}')
     self.assertEqual(bare._type_to_spans['Template'], [])

     with_argument = wtp.WikiText('{{_|text}}')
     self.assertEqual(with_argument._type_to_spans['Template'], [])

     # Exactly one span is expected here (presumably the outer ``text``
     # template, since ``{{_}}`` alone yields none above).
     as_argument = wtp.WikiText('{{text| {{_}} }}')
     self.assertEqual(len(as_argument._type_to_spans['Template']), 1)

     # Used in the name position, no template span is expected at all.
     as_name = wtp.WikiText('{{ {{_|text}} | a }}')
     self.assertEqual(len(as_name._type_to_spans['Template']), 0)
Exemple #3
0
 def test_textmixed_multitemplate(self):
     """Nested templates mixed with plain text all get Template spans."""
     source = ('text1{{cite|{{t1}}|{{t2}}}}'
               'text2{{cite|{{t3}}|{{t4}}}}text3')
     parsed = wtp.WikiText(source)
     # Expected order: the four inner templates, then the two ``cite``
     # templates that enclose them.
     expected_spans = [
         [12, 18], [19, 25], [39, 45], [46, 52], [5, 27], [32, 54],
     ]
     self.assertEqual(parsed._type_to_spans['Template'], expected_spans)
Exemple #4
0
 def test_section_title_may_contain_template_newline_etc(self):
     """A heading may contain newlines inside a template, comment or nowiki."""
     source = (
         '=== h3 {{text\n\n|text}}<!-- \nc -->'
         '<nowiki>\nnw\n</nowiki> ===\nt3'
     )
     parsed = wtp.WikiText(source)
     found = parsed.sections
     # Two sections are expected; index 1 is the one carrying the heading.
     self.assertEqual(2, len(found))
     headed = found[1]
     self.assertEqual(
         ' h3 {{text\n\n|text}}<!-- \nc --><nowiki>\nnw\n</nowiki> ',
         headed.title)
     self.assertEqual('t3', headed.contents)
Exemple #5
0
 def test_template_inside_parameter(self):
     """A template used as a parameter default gets its own span."""
     parsed = wtp.WikiText('{{{1|{{colorbox|yellow|text1}}}}}')

     # The inner ``colorbox`` template.
     template_spans = parsed._type_to_spans['Template']
     self.assertEqual([[5, 30]], template_spans)

     # The enclosing ``{{{1|...}}}`` parameter covers the whole string.
     parameter_spans = parsed._type_to_spans['Parameter']
     self.assertEqual([[0, 33]], parameter_spans)
Exemple #6
0
 def test_parameter_inside_template(self):
     """A parameter used as a template argument gets its own span."""
     parsed = wtp.WikiText('{{colorbox|yellow|{{{1|defualt_text}}}}}')

     # The enclosing ``colorbox`` template covers the whole string.
     template_spans = parsed._type_to_spans['Template']
     self.assertEqual([[0, 40]], template_spans)

     # The inner ``{{{1|...}}}`` parameter.
     parameter_spans = parsed._type_to_spans['Parameter']
     self.assertEqual([[18, 38]], parameter_spans)
Exemple #7
0
    def test_invoking_a_named_ref_is_not_a_ref_start(self):
        """A self-closing named ref inside a template must not hide it.

        Background reading:
        [[mw:Extension:Cite#Multiple_uses_of_the_same_footnote]] and
        [[mw:Help:Extension:Cite]].

        """
        source = '{{text|1=v<ref name=n/>}}\ntext.<ref name=n>r</ref>'
        parsed = wtp.WikiText(source)
        # The template spans the first 25 characters, i.e. everything up to
        # the newline.
        template_spans = parsed._type_to_spans['Template']
        self.assertEqual([[0, 25]], template_spans)
Exemple #8
0
 def test_extracting_sections(self):
     """Check section count, lead-section fields and nested section reprs."""
     simple = wtp.WikiText('== h2 ==\nt2\n\n=== h3 ===\nt3\n\n== h22 ==\nt22')
     simple_sections = simple.sections
     self.assertEqual(4, len(simple_sections))

     # Index 0 is the (here empty) part before the first heading.
     lead = simple_sections[0]
     self.assertEqual(0, lead.level)
     self.assertEqual('', lead.title)
     self.assertEqual('', lead.contents)

     # The h2 section's text includes its h3 subsection.
     self.assertEqual('== h2 ==\nt2\n\n=== h3 ===\nt3\n\n',
                      str(simple_sections[1]))

     nested = wtp.WikiText(
         '\n== 1 ==\n== 2 ==\n=== 2.1 ===\n==== 2.1.1 ====\n'
         '===== 2.1.1.1 =====\n=== 2.2 ===\n=== 2.3 ===\n==== 2.3.1 ====\n'
         '2.3.1\n== 3 ==\n')
     # Every section is listed once, each containing its own subsections.
     expected_repr = (
         "[Section('\\n'), Section('== 1 ==\\n'), "
         "Section('== 2 ==\\n=== 2.1 ===\\n==== 2.1.1 ====\\n"
         "===== 2.1.1.1 =====\\n=== 2.2 ===\\n=== 2.3 ===\\n"
         "==== 2.3.1 ====\\n2.3.1\\n'), Section('=== 2.1 ===\\n"
         "==== 2.1.1 ====\\n===== 2.1.1.1 =====\\n'), "
         "Section('==== 2.1.1 ====\\n===== 2.1.1.1 =====\\n'), "
         "Section('===== 2.1.1.1 =====\\n'), Section('=== 2.2 ===\\n'), "
         "Section('=== 2.3 ===\\n==== 2.3.1 ====\\n2.3.1\\n'), "
         "Section('==== 2.3.1 ====\\n2.3.1\\n'), Section('== 3 ==\\n')]"
     )
     self.assertEqual(expected_repr, str(nested.sections))
    def _generate_cards(src_word: str, pos: str, word_rank: int,
                        subsections: List[wtp.Section]) -> List[Card]:
        """
        Build one Card per part-of-speech section from its list items.

        For every section the list items are converted to plain text, run
        through ``tp.process_templates``, filtered, and finally passed to
        ``CardDataGenerator._add_parenthesed_styling``.  A section whose
        title equals ``pos`` (ignoring case) keeps ``word_rank``; any other
        section is ranked ``1_000_000 + word_rank``.
        """
        # A definition that still contains a ``$$...$$`` marker after
        # template processing is dropped entirely.
        leftover_marker = re.compile(r"^.*\$\$[^\{\}]*\$\$.*$")
        cards = []

        for section in subsections:
            # Gather every list item of the section as its own WikiText.
            raw_definitions = []
            for definition_list in section.get_lists():
                for list_item in definition_list.items:
                    raw_definitions.append(wtp.WikiText(list_item))

            # Plain text, with templates left in place for the next stage.
            plain_definitions = [
                raw.plain_text(replace_templates=False).strip()
                for raw in raw_definitions
            ]

            # Template evaluation.
            expanded_definitions = [
                tp.process_templates(plain) for plain in plain_definitions
            ]

            # Drop definitions that still carry a leftover marker.
            kept_definitions = [
                expanded.strip() for expanded in expanded_definitions
                if leftover_marker.match(expanded) is None
            ]

            # Styling of parenthesised remarks.
            styled_definitions = [
                CardDataGenerator._add_parenthesed_styling(kept)
                for kept in kept_definitions
            ]

            # Sections for a different part of speech are pushed back by a
            # fixed offset.
            if section.title.lower() == pos.lower():
                card_rank = word_rank
            else:
                card_rank = 1_000_000 + word_rank

            card = Card(src_word, section.title, card_rank,
                        styled_definitions)
            cards.append(card)

        return cards
Exemple #10
0
 def test_keyword_and_positional_args_removal(self):
     """Check argument names after keyword and positional arguments are removed.

     ``t1.arguments`` / ``t2.arguments`` are looked up again for every
     assertion and mutation rather than being stored in a local.
     """
     wt = wtp.WikiText("text{{t1|kw=a|1=|pa|kw2=a|pa2}}{{t2|a|1|1=}}text")
     t1 = wt.templates[0]
     t2 = wt.templates[1]
     # Initial state: positional arguments are named by position ('1', '2'),
     # keyword arguments (including the explicit ``1=``) keep their own name.
     self.assertEqual('1', t1.arguments[2].name)
     self.assertEqual('kw2', t1.arguments[3].name)
     self.assertEqual('2', t1.arguments[4].name)
     self.assertEqual('1', t2.arguments[0].name)
     self.assertEqual('2', t2.arguments[1].name)
     self.assertEqual('1', t2.arguments[2].name)
     # Remove t1's first argument (``kw=a``): the remaining arguments move
     # down one index and t2 is unaffected.
     t1.arguments[0].string = ''
     self.assertEqual('1', t1.arguments[0].name)
     self.assertEqual('kw2', t1.arguments[2].name)
     self.assertEqual('|pa2', t1.arguments[3].string)
     self.assertEqual('1', t2.arguments[0].name)
     self.assertEqual('2', t2.arguments[1].name)
     self.assertEqual('1', t2.arguments[2].name)
     # Remove t1's positional ``pa``: ``pa2`` becomes the first positional
     # argument and is now named '1'.
     t1.arguments[1].string = ''
     self.assertEqual("text{{t1|1=|kw2=a|pa2}}{{t2|a|1|1=}}text", wt.string)
     self.assertEqual('pa2', t1.arguments[2].value)
     self.assertEqual('1', t1.arguments[2].name)
     self.assertEqual('a', t2.arguments[0].value)
     self.assertEqual('1', t2.arguments[0].name)
Exemple #11
0
 def test_no_template_for_braces_around_wikilink(self):
     """Double braces wrapped around a wikilink are not a template."""
     parsed = wtp.WikiText('{{[[a]]}}')
     template_spans = parsed._type_to_spans['Template']
     self.assertEqual([], template_spans)
Exemple #12
0
 def test_invalid_refs_that_should_not_produce_any_template(self):
     """No Template span is expected for this malformed ref markup."""
     source = 'f {{text|<ref \n > g}} <ref  name=n />\n</ref  >\n'
     parsed = wtp.WikiText(source)
     template_spans = parsed._type_to_spans['Template']
     self.assertEqual([], template_spans)
Exemple #13
0
 def test_lacks_ending_braces(self):
     """An unclosed outer template still lets the inner ones be found."""
     parsed = wtp.WikiText('{{cite|{{t1}}|{{t2}}')
     # Only ``{{t1}}`` and ``{{t2}}`` are expected; the unclosed ``cite``
     # contributes no span.
     template_spans = parsed._type_to_spans['Template']
     self.assertEqual([[7, 13], [14, 20]], template_spans)
Exemple #14
0
 def test_lacks_starting_braces(self):
     """Stray closing braces do not stop the inner templates being found."""
     parsed = wtp.WikiText('cite|{{t1}}|{{t2}}}}')
     # Only ``{{t1}}`` and ``{{t2}}`` are expected; the trailing ``}}`` has
     # no matching opener.
     template_spans = parsed._type_to_spans['Template']
     self.assertEqual([[5, 11], [12, 18]], template_spans)
Exemple #15
0
 def test_multiline_mutitemplate(self):
     """Nested templates spread over several lines are all detected."""
     parsed = wtp.WikiText('{{cite\n    |{{t1}}\n    |{{t2}}}}')
     # The two inner templates come first, the enclosing ``cite`` last.
     expected_spans = [[12, 18], [24, 30], [0, 32]]
     self.assertEqual(parsed._type_to_spans['Template'], expected_spans)
Exemple #16
0
 def test_wikilinks_inside_exttags(self):
     """A wikilink inside ``<ref>`` tags is recorded as a WikiLink span."""
     parsed = wtp.WikiText('<ref>[[w]]</ref>')
     self.assertEqual([[5, 10]], parsed._type_to_spans['WikiLink'])
Exemple #17
0
 def test_unicode_parser_function(self):
     """A parser function with a non-ASCII name gets a ParserFunction span."""
     parsed = wtp.WikiText('{{#اگر:|فلان}}')
     # The span covers the whole 14-character input.
     function_spans = parsed._type_to_spans['ParserFunction']
     self.assertEqual([[0, 14]], function_spans)
Exemple #18
0
 def test_unicode_parameters(self):
     """Nested parameters with non-ASCII names both get Parameter spans."""
     parsed = wtp.WikiText('{{{پارا۱|{{{پارا۲|پيشفرض}}}}}}')
     # Inner parameter first, then the enclosing one.
     parameter_spans = parsed._type_to_spans['Parameter']
     self.assertEqual([[9, 27], [0, 30]], parameter_spans)
Exemple #19
0
 def test_template_name_cannot_contain_newline(self):
     """A name split by a newline between two words is not a template."""
     parsed = wtp.WikiText('{{\nColor\nbox\n|mytext}}')
     template_spans = parsed._type_to_spans['Template']
     self.assertEqual([], template_spans)
Exemple #20
0
 def test_unicode_template(self):
     """A non-ASCII template name surrounded by newlines is accepted."""
     parsed = wtp.WikiText('{{\nرنگ\n|متن}}')
     # The span covers the whole 13-character input.
     template_spans = parsed._type_to_spans['Template']
     self.assertEqual([[0, 13]], template_spans)
Exemple #21
0
 def test_template_in_template(self):
     """Inner and outer templates are all present, in any order."""
     parsed = wtp.WikiText('{{cite|{{t1}}|{{t2}}}}')
     found_spans = parsed._type_to_spans['Template']
     # Membership checks only: the order of the spans is not asserted.
     self.assertIn([7, 13], found_spans)
     self.assertIn([14, 20], found_spans)
     self.assertIn([0, 22], found_spans)