def read_wiki_data(input_filename):
    """Read a wikitext file and collect its known article sections.

    The file is decoded as UTF-8, parsed with ``wikitextparser`` and every
    section whose title exactly matches one of the recognised headings is
    stored under a fixed key.

    Args:
        input_filename: Path of the wikitext file to read.

    Returns:
        A dict mapping keys such as ``"intro_paragraph"`` or
        ``"winner_paragraphs"`` to the full text (``section.string``) of the
        matching section. Headings missing from the file have no entry; if a
        heading occurs more than once, the last occurrence wins.
    """
    # Titles are compared exactly, including the padding spaces that sit
    # between the heading text and the surrounding `=` markers.
    title_to_key = {
        " Intro paragraph ": "intro_paragraph",
        " Losers ": "loser_paragraphs",
        " Winners ": "winner_paragraphs",
        " Still in the Running ": "running_paragraphs",
        " Biggies ": "biggie_paragraphs",
        " Hidden Gems ": "gem_paragraphs",
        " Conclusion ": "conclusion_paragraph",
    }

    # The context manager closes the file even when reading or decoding fails.
    with open(input_filename, "r", encoding="utf-8") as fp:
        input_text = fp.read()

    parsed_text = wikitextparser.WikiText(input_text)

    article_sections = {}
    for section in parsed_text.sections:
        key = title_to_key.get(section.title)
        if key is not None:
            article_sections[key] = section.string
    return article_sections
def test_template_name_cannot_be_empty(self):
    """Templates whose name is only ``_`` must not produce a Template span."""
    bare = wtp.WikiText('{{_}}')
    self.assertEqual(bare._type_to_spans['Template'], [])

    with_argument = wtp.WikiText('{{_|text}}')
    self.assertEqual(with_argument._type_to_spans['Template'], [])

    # The invalid template is an argument of a validly named template:
    # exactly one span is expected.
    as_argument = wtp.WikiText('{{text| {{_}} }}')
    self.assertEqual(len(as_argument._type_to_spans['Template']), 1)

    # The invalid template sits in the name position of the outer braces:
    # no span at all is expected.
    as_name = wtp.WikiText('{{ {{_|text}} | a }}')
    self.assertEqual(len(as_name._type_to_spans['Template']), 0)
def test_textmixed_multitemplate(self):
    """Nested templates interleaved with plain text all get spans."""
    parsed = wtp.WikiText(
        'text1{{cite|{{t1}}|{{t2}}}}'
        'text2{{cite|{{t3}}|{{t4}}}}text3'
    )
    template_spans = parsed._type_to_spans['Template']
    # The four inner templates come first, then the two enclosing `cite`s.
    expected_spans = [
        [12, 18], [19, 25], [39, 45], [46, 52], [5, 27], [32, 54],
    ]
    self.assertEqual(template_spans, expected_spans)
def test_section_title_may_contain_template_newline_etc(self):
    """A heading may contain newlines that belong to nested markup.

    The title below embeds a template, an HTML comment and a ``<nowiki>``
    tag, each of which contains line breaks.
    """
    parsed = wtp.WikiText(
        '=== h3 {{text\n\n|text}}<!-- \nc -->'
        '<nowiki>\nnw\n</nowiki> ===\nt3'
    )
    sections = parsed.sections
    self.assertEqual(2, len(sections))

    # Index 1 is the section introduced by the level-3 heading.
    heading_section = sections[1]
    self.assertEqual(
        ' h3 {{text\n\n|text}}<!-- \nc --><nowiki>\nnw\n</nowiki> ',
        heading_section.title,
    )
    self.assertEqual('t3', heading_section.contents)
def test_template_inside_parameter(self):
    """A template used as a parameter's default gets its own span."""
    parsed = wtp.WikiText('{{{1|{{colorbox|yellow|text1}}}}}')

    template_spans = parsed._type_to_spans['Template']
    self.assertEqual([[5, 30]], template_spans)

    # The parameter covers the whole input string.
    parameter_spans = parsed._type_to_spans['Parameter']
    self.assertEqual([[0, 33]], parameter_spans)
def test_parameter_inside_template(self):
    """A parameter used as a template argument gets its own span."""
    parsed = wtp.WikiText('{{colorbox|yellow|{{{1|defualt_text}}}}}')

    # The template covers the whole input string.
    template_spans = parsed._type_to_spans['Template']
    self.assertEqual([[0, 40]], template_spans)

    parameter_spans = parsed._type_to_spans['Parameter']
    self.assertEqual([[18, 38]], parameter_spans)
def test_invoking_a_named_ref_is_not_a_ref_start(self):
    """A self-closing named ref must not be taken for an opening ref tag.

    Background reading:
    [[mw:Extension:Cite#Multiple_uses_of_the_same_footnote]] and
    [[mw:Help:Extension:Cite]].
    """
    source = '{{text|1=v<ref name=n/>}}\ntext.<ref name=n>r</ref>'
    parsed = wtp.WikiText(source)
    # The template must end at its own closing braces (offset 25) even
    # though a real <ref>...</ref> pair follows it.
    self.assertEqual([[0, 25]], parsed._type_to_spans['Template'])
def test_extracting_sections(self):
    """Sections are extracted with their subsections nested inside them."""
    simple = wtp.WikiText(
        '== h2 ==\nt2\n\n=== h3 ===\nt3\n\n== h22 ==\nt22')
    sections = simple.sections
    self.assertEqual(4, len(sections))

    # Index 0 is the lead section: level 0, empty title, and empty contents
    # here because the text opens directly with a heading.
    lead = sections[0]
    self.assertEqual(0, lead.level)
    self.assertEqual('', lead.title)
    self.assertEqual('', lead.contents)

    # The level-2 section's text includes its level-3 subsection.
    self.assertEqual(
        '== h2 ==\nt2\n\n=== h3 ===\nt3\n\n', str(sections[1]))

    deeply_nested = wtp.WikiText(
        '\n== 1 ==\n== 2 ==\n=== 2.1 ===\n==== 2.1.1 ====\n'
        '===== 2.1.1.1 =====\n=== 2.2 ===\n=== 2.3 ===\n==== 2.3.1 ====\n'
        '2.3.1\n== 3 ==\n')
    # Every heading yields its own Section, and each parent section also
    # contains the text of all of its descendants.
    expected_repr = (
        "[Section('\\n'), Section('== 1 ==\\n'), "
        "Section('== 2 ==\\n=== 2.1 ===\\n==== 2.1.1 ====\\n"
        "===== 2.1.1.1 =====\\n=== 2.2 ===\\n=== 2.3 ===\\n"
        "==== 2.3.1 ====\\n2.3.1\\n'), Section('=== 2.1 ===\\n"
        "==== 2.1.1 ====\\n===== 2.1.1.1 =====\\n'), "
        "Section('==== 2.1.1 ====\\n===== 2.1.1.1 =====\\n'), "
        "Section('===== 2.1.1.1 =====\\n'), Section('=== 2.2 ===\\n'), "
        "Section('=== 2.3 ===\\n==== 2.3.1 ====\\n2.3.1\\n'), "
        "Section('==== 2.3.1 ====\\n2.3.1\\n'), Section('== 3 ==\\n')]"
    )
    self.assertEqual(expected_repr, str(deeply_nested.sections))
def _generate_cards(src_word: str, pos: str, word_rank: int,
                    subsections: List[wtp.Section]) -> List[Card]:
    """Build one Card per part-of-speech section from its definition lists.

    For every section the list items are parsed as wikitext, flattened to
    plain text with templates left in place, passed through
    ``tp.process_templates``, filtered, and finally given parenthesis
    styling.

    Args:
        src_word: The word the cards are generated for.
        pos: The part of speech that keeps the unmodified rank.
        word_rank: Rank assigned to cards whose section title equals ``pos``
            (compared case-insensitively).
        subsections: Part-of-speech sections to extract definitions from.

    Returns:
        A list with one Card per entry of ``subsections``, in the same order.
        Cards from sections whose title differs from ``pos`` get the rank
        ``1_000_000 + word_rank``.
    """
    # A definition that still carries a `$$...$$` marker after template
    # processing is the remainder of a template and is discarded entirely.
    leftover_template = re.compile(r"^.*\$\$[^\{\}]*\$\$.*$")

    cards = []
    for pos_section in subsections:
        # Every item of every list in the section, parsed as wikitext.
        parsed_defs = [
            wtp.WikiText(entry)
            for wiki_list in pos_section.get_lists()
            for entry in wiki_list.items
        ]

        # Plain text, with template markup kept for the next stage.
        plain_defs = [
            parsed_def.plain_text(replace_templates=False).strip()
            for parsed_def in parsed_defs
        ]

        # Evaluate the template expressions.
        expanded_defs = [
            tp.process_templates(plain_def) for plain_def in plain_defs
        ]

        # Drop definitions that still contain template leftovers.
        kept_defs = [
            expanded_def.strip()
            for expanded_def in expanded_defs
            if leftover_template.match(expanded_def) is None
        ]

        # Style parenthesised remarks separately for readability.
        styled_defs = [
            CardDataGenerator._add_parenthesed_styling(kept_def)
            for kept_def in kept_defs
        ]

        # Sections for a different part of speech are pushed far down.
        if pos_section.title.lower() == pos.lower():
            true_rank = word_rank
        else:
            true_rank = 1_000_000 + word_rank

        cards.append(
            Card(src_word, pos_section.title, true_rank, styled_defs))
    return cards
def test_keyword_and_positional_args_removal(self):
    """Removing arguments renumbers the remaining positional arguments.

    Two templates share one WikiText; edits to the first must leave the
    argument names of the second untouched.
    """
    parsed = wtp.WikiText(
        "text{{t1|kw=a|1=|pa|kw2=a|pa2}}{{t2|a|1|1=}}text")
    first = parsed.templates[0]
    second = parsed.templates[1]

    # Names before any modification.
    self.assertEqual('1', first.arguments[2].name)
    self.assertEqual('kw2', first.arguments[3].name)
    self.assertEqual('2', first.arguments[4].name)
    self.assertEqual('1', second.arguments[0].name)
    self.assertEqual('2', second.arguments[1].name)
    self.assertEqual('1', second.arguments[2].name)

    # Remove the leading keyword argument `kw=a` from the first template.
    first.arguments[0].string = ''
    self.assertEqual('1', first.arguments[0].name)
    self.assertEqual('kw2', first.arguments[2].name)
    self.assertEqual('|pa2', first.arguments[3].string)
    self.assertEqual('1', second.arguments[0].name)
    self.assertEqual('2', second.arguments[1].name)
    self.assertEqual('1', second.arguments[2].name)

    # Remove the positional argument `pa`; `pa2` then becomes position 1.
    first.arguments[1].string = ''
    self.assertEqual(
        "text{{t1|1=|kw2=a|pa2}}{{t2|a|1|1=}}text", parsed.string)
    self.assertEqual('pa2', first.arguments[2].value)
    self.assertEqual('1', first.arguments[2].name)
    self.assertEqual('a', second.arguments[0].value)
    self.assertEqual('1', second.arguments[0].name)
def test_no_template_for_braces_around_wikilink(self):
    """Double braces that wrap only a wikilink are not a template."""
    parsed = wtp.WikiText('{{[[a]]}}')
    template_spans = parsed._type_to_spans['Template']
    self.assertEqual([], template_spans)
def test_invalid_refs_that_should_not_produce_any_template(self):
    """No Template span is expected for this input with odd ref tags."""
    source = 'f {{text|<ref \n > g}} <ref name=n />\n</ref >\n'
    parsed = wtp.WikiText(source)
    template_spans = parsed._type_to_spans['Template']
    self.assertEqual([], template_spans)
def test_lacks_ending_braces(self):
    """Inner templates are found even if the outer one is never closed."""
    parsed = wtp.WikiText('{{cite|{{t1}}|{{t2}}')
    # Only `{{t1}}` and `{{t2}}` are complete; the unclosed `cite` is not.
    inner_only = [[7, 13], [14, 20]]
    self.assertEqual(inner_only, parsed._type_to_spans['Template'])
def test_lacks_starting_braces(self):
    """Inner templates are found even if the outer one is never opened."""
    parsed = wtp.WikiText('cite|{{t1}}|{{t2}}}}')
    # Only `{{t1}}` and `{{t2}}` are complete; the stray `}}` is ignored.
    inner_only = [[5, 11], [12, 18]]
    self.assertEqual(inner_only, parsed._type_to_spans['Template'])
def test_multiline_mutitemplate(self):
    """Templates spread over several indented lines get correct spans."""
    # Each continuation line is indented by four spaces. The expected
    # offsets depend on that exact width: 7 characters for '{{cite\n',
    # 4 spaces and '|' put '{{t1}}' at 12, and the same prefix after the
    # next newline puts '{{t2}}' at 24.
    wt = wtp.WikiText('{{cite\n    |{{t1}}\n    |{{t2}}}}')
    self.assertEqual(
        wt._type_to_spans['Template'],
        [[12, 18], [24, 30], [0, 32]],
    )
def test_wikilinks_inside_exttags(self):
    """A wikilink nested in a ``<ref>`` extension tag is still detected."""
    parsed = wtp.WikiText('<ref>[[w]]</ref>')
    wikilink_spans = parsed._type_to_spans['WikiLink']
    self.assertEqual([[5, 10]], wikilink_spans)
def test_unicode_parser_function(self):
    """A parser function with a non-ASCII name is recognised."""
    parsed = wtp.WikiText('{{#اگر:|فلان}}')
    # The span covers the whole 14-character input.
    function_spans = parsed._type_to_spans['ParserFunction']
    self.assertEqual([[0, 14]], function_spans)
def test_unicode_parameters(self):
    """Nested parameters with non-ASCII names are both recognised."""
    parsed = wtp.WikiText('{{{پارا۱|{{{پارا۲|پيشفرض}}}}}}')
    # The inner parameter is listed before the enclosing one.
    parameter_spans = parsed._type_to_spans['Parameter']
    self.assertEqual([[9, 27], [0, 30]], parameter_spans)
def test_template_name_cannot_contain_newline(self):
    """A name with a line break in its middle does not form a template."""
    parsed = wtp.WikiText('{{\nColor\nbox\n|mytext}}')
    template_spans = parsed._type_to_spans['Template']
    self.assertEqual([], template_spans)
def test_unicode_template(self):
    """A non-ASCII template name surrounded by newlines is recognised."""
    parsed = wtp.WikiText('{{\nرنگ\n|متن}}')
    # The span covers the whole 13-character input.
    template_spans = parsed._type_to_spans['Template']
    self.assertEqual([[0, 13]], template_spans)
def test_template_in_template(self):
    """Both inner templates and the enclosing one are spanned.

    Membership is checked span by span, so the order in which the parser
    records them does not matter here.
    """
    found = wtp.WikiText('{{cite|{{t1}}|{{t2}}}}')._type_to_spans['Template']
    self.assertIn([7, 13], found)
    self.assertIn([14, 20], found)
    self.assertIn([0, 22], found)