Esempio n. 1
0
 def run_data(self, item, word="testpage", lang="English",
              field="related", ruby="", sense=None, senses=None,
              ctx=None, is_reconstruction=False):
     """Parse one linkage item, expecting the parser to return None.

     Starts page ``word`` and section ``lang`` on the context, calls
     ``parse_linkage_item_text`` with a fresh ``data`` dict, and asserts
     that the call returns ``None``.  When no ``ctx`` is supplied, a new
     ``Wtp`` is created and the test additionally asserts that it
     recorded no errors, warnings or debug messages.

     ``senses`` defaults to a new empty list on every call; a literal
     ``[]`` default would be one list object shared by all calls, so
     any mutation by the parser could leak from one test into the next.

     Returns the ``data`` dict that was handed to the parser.
     """
     assert isinstance(item, str)
     assert isinstance(word, str)
     assert isinstance(lang, str)
     assert isinstance(field, str)
     assert isinstance(ruby, str)
     assert sense is None or isinstance(sense, str)
     if senses is None:
         senses = []
     assert isinstance(senses, list)
     assert ctx is None or isinstance(ctx, Wtp)
     self.ctx = Wtp() if ctx is None else ctx
     self.config = WiktionaryConfig()
     self.ctx.start_page(word)
     self.ctx.start_section(lang)
     data = {}
     ret = parse_linkage_item_text(self.ctx, word, data, field, item,
                                   sense, ruby, senses, is_reconstruction)
     self.assertIsNone(ret)
     # Only a context created here is known to start clean; a
     # caller-supplied one may already carry messages.
     if ctx is None:
         self.assertEqual(self.ctx.errors, [])
         self.assertEqual(self.ctx.warnings, [])
         self.assertEqual(self.ctx.debugs, [])
     return data
Esempio n. 2
0
 def setUp(self):
     """Build a fresh ``Wtp`` context and ``WiktionaryConfig`` for a test.

     The context has its templates analyzed and is positioned on the
     page "testpage"; the config is created with every ``capture_*``
     option given explicitly.
     """
     self.maxDiff = 20000
     ctx = self.ctx = Wtp()
     ctx.analyze_templates()
     ctx.start_page("testpage")
     capture_options = dict(
         capture_languages=None,
         capture_translations=True,
         capture_pronunciation=True,
         capture_linkages=True,
         capture_compounds=True,
         capture_redirects=True,
         capture_examples=True,
     )
     self.config = WiktionaryConfig(**capture_options)
 def setUp(self):
     """Create a context on page "testpage", section "English", plus a
     default ``WiktionaryConfig``."""
     self.maxDiff = 100000
     ctx = self.ctx = Wtp()
     self.config = WiktionaryConfig()
     # Position the context: page first, then the language section.
     ctx.start_page("testpage")
     ctx.start_section("English")
Esempio n. 4
0
 def setUp(self):
     """Create a context started on page "testpage" and a default
     ``WiktionaryConfig``."""
     ctx = self.ctx = Wtp()
     self.config = WiktionaryConfig()
     ctx.start_page("testpage")
 def setUp(self):
     """Create a context on page "abolitionism", section "English", plus
     a default ``WiktionaryConfig``."""
     self.maxDiff = 20000
     ctx = self.ctx = Wtp()
     self.config = WiktionaryConfig()
     # Note: some tests use the last character of this page title.
     ctx.start_page("abolitionism")
     ctx.start_section("English")
Esempio n. 6
0
class WiktExtractTests(unittest.TestCase):

    config = WiktionaryConfig()

    def test_pos(self):
        poses = wiktextract.PARTS_OF_SPEECH
        assert isinstance(poses, set)
        assert "noun" in poses
        assert "verb" in poses
        assert "pron" in poses
        assert "adj" in poses
        assert "adv" in poses
        assert "num" in poses
        assert len(poses) < 50

    def test_cv_plain(self):
        v = "This is a test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_comment(self):
        v = "This <!--comment--> is a test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_unk(self):
        v = "This is a {{unknown-asdxfa}} test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_ref(self):
        v = "This <ref>junk\nmore junk</ref> is a test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_html(self):
        v = "This <thispurportstobeatag> is a test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_html2(self):
        v = "This </thispurportstobeatag> is a test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_link1(self):
        v = "This is a [[test]]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_link2(self):
        v = "This is a [[w:foo|test]]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_link3(self):
        v = "This is a [[w:foo|]]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a foo.")

    def test_cv_link4(self):
        v = "This is a [[bar]]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a bar.")

    def test_cv_htmllink(self):
        v = "This is a [http://ylonen.org test]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_q2(self):
        v = "This is a ''test''."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_q3(self):
        v = "This is a '''test'''."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_nbsp(self):
        v = "This is a&nbsp;test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_gt(self):
        v = "This is a &lt;test&gt;."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a <test>.")

    def test_cv_gt(self):
        v = "This is a t\u2019est."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a t'est.")

    def test_cv_sp(self):
        v = "  This\nis \na\n   test.\t"
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_presp(self):
        v = " This : is a test . "
        v = clean_value(self.config, v)
        self.assertEqual(v, "This: is a test.")

    def test_cv_presp(self):
        v = " This ; is a test , "
        v = clean_value(self.config, v)
        self.assertEqual(v, "This; is a test,")

    def test_cv_excl(self):
        v = " Run !\n"
        v = clean_value(self.config, v)
        self.assertEqual(v, "Run!")

    def test_cv_ques(self):
        v = " Run ?\n"
        v = clean_value(self.config, v)
        self.assertEqual(v, "Run?")
Esempio n. 7
0
class WiktExtractTests(unittest.TestCase):

    config = WiktionaryConfig()

    def test_pos(self):
        poses = wiktextract.PARTS_OF_SPEECH
        assert isinstance(poses, set)
        assert "noun" in poses
        assert "verb" in poses
        assert "pron" in poses
        assert "adj" in poses
        assert "adv" in poses
        assert "num" in poses
        assert len(poses) < 50

    def test_cv_plain(self):
        v = "This is a test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_comment(self):
        v = "This <!--comment--> is a test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_repl0(self):
        v = "This is 1500 {{BC}}"
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is 1500 BC")

    def test_cv_repl1(self):
        v = "This is a {{given name|en|female}}"
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a female given name")

    def test_cv_repl1_arg1(self):
        v = "This is a {{given name|en|lang=fi|female}}"
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a female given name")

    def test_cv_repl1_arg2(self):
        v = "This is a {{given name|en|female|lang=fi}}"
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a female given name")

    def test_cv_repl1_surname(self):
        v = "This is a {{surname|from=nickname|lang=fi}}"
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a surname")

    def test_cv_repl1_taxon(self):
        v = "{{taxon|genus|family|Talpidae|[[insectivore]] mammals; typical [[mole]]s}}"
        v = clean_value(self.config, v)
        self.assertEqual(v, "taxonomic genus")

    def test_cv_arg1(self):
        v = "This is a {{w|test}}."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_arg2(self):
        v = "This is a {{w|test article|test value}}."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test value.")

    def test_cv_arg3(self):
        v = "This is a {{w2|fi||test}}."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_arg_nest(self):
        v = "This is a {{w2|fi||{{given name|en|male}}}}."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a male given name.")

    def test_cv_unk(self):
        v = "This is a {{unknown-asdxfa}} test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_ref(self):
        v = "This <ref>junk\nmore junk</ref> is a test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_html(self):
        v = "This <thispurportstobeatag> is a test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_html2(self):
        v = "This </thispurportstobeatag> is a test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_link1(self):
        v = "This is a [[test]]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_link2(self):
        v = "This is a [[w:foo|test]]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_link3(self):
        v = "This is a [[w:foo|]]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a foo.")

    def test_cv_link4(self):
        v = "This is a [[bar]]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a bar.")

    def test_cv_htmllink(self):
        v = "This is a [http://ylonen.org test]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_q2(self):
        v = "This is a ''test''."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_q3(self):
        v = "This is a '''test'''."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_nbsp(self):
        v = "This is a&nbsp;test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_gt(self):
        v = "This is a &lt;test&gt;."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a <test>.")

    def test_cv_gt(self):
        v = "This is a t\u2019est."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a t'est.")

    def test_cv_sp(self):
        v = "  This\nis \na\n   test.\t"
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_presp(self):
        v = " This : is a test . "
        v = clean_value(self.config, v)
        self.assertEqual(v, "This: is a test.")

    def test_cv_presp(self):
        v = " This ; is a test , "
        v = clean_value(self.config, v)
        self.assertEqual(v, "This; is a test,")

    def test_cv_excl(self):
        v = " Run !\n"
        v = clean_value(self.config, v)
        self.assertEqual(v, "Run!")

    def test_cv_ques(self):
        v = " Run ?\n"
        v = clean_value(self.config, v)
        self.assertEqual(v, "Run?")

    def test_cv_nested1(self):
        v = "{{acronym of|es|{{w|Geroa Bai|lang=es}}}}"
        v = clean_value(self.config, v)
        self.assertEqual(v, 'acronym of "Geroa Bai"')

    def test_page1(self):
        v = """
{{lb|en|agriculture|and|soil science|of pasture soils}} Tending toward [[scouring#Noun|scouring]] (diarrheal illness) in [[graze#Verb|grazing]] livestock, being high in [[molybdenum]] content and neutral to alkaline in [[pH]].
 {{quote-journal |en|url=https://www.cabdirect.org/cabdirect/abstract/19412200374 |last=Green |first=H.H. |year=1940 |title=[Abstract of a forthcoming bulletin from {{w|Imperial Chemical Industries}}] |journal=[https://www.cabi.org/publishing-products/online-information-resources/veterinary-bulletin/ Veterinary Bulletin] |passage=Abstract: The novelty of the subject matter and the fact that the information is conveyed in the form of a bulletin addressed to farmers, pending later publication of further experimental data in the scientific press, justifies a lengthy abstract for readers of the ''Veterinary Bulletin''. The local word "'''teart'''" (i.e. [[tart]]) is applied to land and pastures [in {{w|Somerset}}, {{w|Warwickshire}}, and {{w|Gloucestershire}}] upon which severe scouring occurs in grazing [[ruminant]]s, particularly cows [[lactating|in milk]] and young stock. Sheep are less affected, and horses and pigs appear to be unaffected. […] Most affected farms contain both '''teart''' and non-teart land and the degree of "[[teartness]]" varies with season and from field to field. […] The cause of [[teartness]] is traced to the presence of molybdenum in the herbage in amounts varying from 20-100 [[ppm|p.p.m.]] of the dry matter, and the degree of [[teartness]] is roughly proportional to the molybdenum content, particularly to the amount in water-soluble form. Of the total molybdenum present, about 80% is soluble in the case of green grass, about 40% in the case of hay, and 10% in the case of moribund winter herbage. Hence growing [[pasture]]s may be '''teart''' even when cut [[hay]] is not. […] [Various [[ameliorant]]s are available but] Wherever possible, however, it is advisable to convert '''teart''' pastures to [[arable]] land. [H.H. 
Green, [[abstracter]], in an abstract of a forthcoming bulletin from {{w|Imperial Chemical Industries}}.<ref name="oclc_41934659">{{cite-book |oclc=41934659 |year=1941 |author=Ferguson WS |author2=Lewis AH |author3=Watson SJ |editors= |title=The Teart Pasture of Somerset: Cause of Teartness and its Prevention. Bulletin No. 1 of the {{w|Jealott's Hill#Syngenta research site|Jealott's Hill Research Station}}.}}</ref>]}}
 {{quote-journal |en|doi=10.1017/S0021859600048371 |last=Lewis |first=AH |year=1943 |title=The '''teart''' pastures of Somerset: II. Relation between soil and teartness |journal={{w|The Journal of Agricultural Science}} |volume=33 |issue=1 |pages=52-57 |passage='''Teart''' soils contain {{w|molybdenum}} in amounts varying from about 0·002 to 0·010% in the {{w|soil horizon#A horizon|surface horizon}} and are [[neutral]] or [[alkaline]] in reaction and often [[calcareous]]. The contents of molybdenum increase down the soil profile. Those [soils] which are acid in reaction in the surface horizons are not '''teart''' even if their molybdenum content is high. […] How a knowledge of the relation between soil and [[teartness]] can be turned to practical advantage is briefly discussed.}}"""
        v = clean_value(self.config, v)
        self.assertEqual(
            v,
            "(agriculture and soil science of pasture soils) Tending toward scouring (diarrheal illness) in grazing livestock, being high in molybdenum content and neutral to alkaline in pH."
        )
Esempio n. 8
0
class WiktExtractTests(unittest.TestCase):

    config = WiktionaryConfig()

    def test_pos(self):
        poses = wiktextract.PARTS_OF_SPEECH
        assert isinstance(poses, set)
        assert "noun" in poses
        assert "verb" in poses
        assert "pron" in poses
        assert "adj" in poses
        assert "adv" in poses
        assert "num" in poses
        assert len(poses) < 50

    def test_cv_plain(self):
        v = "This is a test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_comment(self):
        v = "This <!--comment--> is a test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_unk1(self):
        # We no longer clean unknown templates
        v = "This is a {{unknown-asdxfa}} test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a {{unknown-asdxfa}} test.")

    def test_cv_unk(self):
        # We no longer clean unknown template arguments
        v = "This is a {{{1}}} test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a {{{1}}} test.")

    def test_cv_ref(self):
        v = "This <ref>junk\nmore junk</ref> is a test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_html(self):
        v = "This <thispurportstobeatag> is a test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_html2(self):
        v = "This </thispurportstobeatag> is a test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_link1(self):
        v = "This is a [[test]]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_link2(self):
        v = "This is a [[w:foo|test]]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_link3(self):
        v = "This is a [[w:foo|]]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a foo.")

    def test_cv_link4(self):
        v = "This is a [[bar]]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a bar.")

    def test_cv_link5(self):
        v = "([[w:Jurchen script|Jurchen script]]: , Image: [[FIle:Da (Jurchen script).png|25px]])"
        v = clean_value(self.config, v)
        self.assertEqual(v, "(Jurchen script: , Image: )")

    def test_cv_link6(self):
        v = "[[:w:Foo|Foo]]"
        v = clean_value(self.config, v)
        self.assertEqual(v, "Foo")

    def test_cv_link7(self):
        v = "[[:w:Foo|Foo [...]]]"
        v = clean_value(self.config, v)
        self.assertEqual(v, "Foo …")

    def test_cv_link8(self):
        v = "[[File:MiG-17F Top View.JPG|thumb|right|A MiG-17 jet.]]\nBorrowed"
        v = clean_value(self.config, v)
        self.assertEqual(v, "Borrowed")

    def test_cv_url1(self):
        v = "This is a [http://ylonen.org test]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_url2(self):
        v = "This is a [http://ylonen.org test1 test2]."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test1 test2.")

    def test_cv_url3(self):
        v = "foo^([http://ylonen.org])"
        v = clean_value(self.config, v)
        self.assertEqual(v, "foo")

    def test_cv_url4(self):
        v = "foo^(http://ylonen.org)"
        v = clean_value(self.config, v)
        self.assertEqual(v, "foo")

    def test_cv_url5(self):
        v = "foo [http://ylonen.org]"
        v = clean_value(self.config, v)
        self.assertEqual(v, "foo http://ylonen.org")

    def test_cv_url6(self):
        v = "[[http://ylonen.org] FOO]"
        v = clean_value(self.config, v)
        self.assertEqual(v, "FOO")

    def test_cv_q2(self):
        v = "This is a ''test''."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_q3(self):
        v = "This is a '''test'''."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_nbsp(self):
        v = "This is a&nbsp;test."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a test.")

    def test_cv_gt(self):
        v = "This is a &lt;test&gt;."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a <test>.")

    def test_cv_unicode_apostrophe(self):
        v = "This is a t\u2019est."
        v = clean_value(self.config, v)
        self.assertEqual(v, "This is a t\u2019est.")

    def test_cv_sp(self):
        v = "  This\nis \na\n   test.\t"
        v = clean_value(self.config, v)
        # The code has been changed to keep newlines
        self.assertEqual(v, "This\nis\na\n test.")

    def test_cv_presp(self):
        v = " This : is a test . "
        v = clean_value(self.config, v)
        self.assertEqual(v, "This: is a test.")

    def test_cv_presp(self):
        v = " This ; is a test , "
        v = clean_value(self.config, v)
        self.assertEqual(v, "This ; is a test ,")

    def test_cv_excl(self):
        v = " Run !\n"
        v = clean_value(self.config, v)
        self.assertEqual(v, "Run !")

    def test_cv_ques(self):
        v = " Run ?\n"
        v = clean_value(self.config, v)
        self.assertEqual(v, "Run ?")

    def test_cv_math1(self):
        v = r"foo <math>a \times \zeta = c</math> bar"
        v = clean_value(self.config, v)
        self.assertEqual(v, "foo a⨯𝜁=c bar")

    def test_cv_math2(self):
        v = r"<math>\frac{a}{b + c}</math>"
        v = clean_value(self.config, v)
        self.assertEqual(v, "a/(b+c)")

    def test_cv_math3(self):
        v = r"<math>\frac{a + 1}{b + c}</math>"
        v = clean_value(self.config, v)
        self.assertEqual(v, "(a+1)/(b+c)")

    def test_cv_math4(self):
        v = r"<math>\frac\alpha\beta</math>"
        v = clean_value(self.config, v)
        self.assertEqual(v, "𝛼/𝛽")

    def test_cv_math5(self):
        v = r"<math>{\mathfrak A} - {\mathbb B} \cup {\mathcal K}</math>"
        v = clean_value(self.config, v)
        self.assertEqual(v, "𝔄-𝔹∪𝒦")

    def test_cv_math6(self):
        v = r"<math>\sum_{i=0}^100 1/i</math>"
        v = clean_value(self.config, v)
        self.assertEqual(v, "∑ᵢ₌₀¹⁰⁰1/i")

    def test_cv_math7(self):
        v = r"<math>x^\infty</math>"
        v = clean_value(self.config, v)
        print(ascii(v))
        self.assertEqual(v, "x\u2002᪲")

    def test_cv_math8(self):
        v = r"<math>4 7</math>"
        v = clean_value(self.config, v)
        self.assertEqual(v, "4 7")

    def test_cv_math9(self):
        v = r"<math>a x + b</math>"
        v = clean_value(self.config, v)
        self.assertEqual(v, "ax+b")

    def test_cv_math9(self):
        v = r"<math>4^7</math>"
        v = clean_value(self.config, v)
        self.assertEqual(v, "4⁷")

    def test_cv_sup1(self):
        v = r"x<sup>3</sup>"
        v = clean_value(self.config, v)
        self.assertEqual(v, "x³")

    def test_cv_sub1(self):
        v = r"x<sub>3</sub>"
        v = clean_value(self.config, v)
        self.assertEqual(v, "x₃")

    def test_cv_chem1(self):
        v = r"<chem>H2O</chem>"
        v = clean_value(self.config, v)
        self.assertEqual(v, "H₂O")

    def test_cv_ellipsis(self):
        v = "[...]"
        v = clean_value(self.config, v)
        self.assertEqual(v, "…")

    def test_cv_div1(self):
        v = "foo<div>bar</div>"
        v = clean_value(self.config, v)
        self.assertEqual(v, "foo\nbar")

    def test_cv_paragraph1(self):
        v = "foo\n\nbar"
        v = clean_value(self.config, v)
        self.assertEqual(v, "foo\nbar")

    def test_cv_html_sp1(self):
        v = "<span>foo</span><span> bar</span>"
        v = clean_value(self.config, v)
        self.assertEqual(v, "foo bar")

    def test_cv_misc1(self):
        v = """<span style="font-style: normal;">[</span></span><span title="from Their First Rise and Settlement in the Island of Providence, to the Present Time. With the Remarkable Actions and Adventures of the Two Female Pyrates Mary Read and Anne Bonny; [...] To which is Added. A Short Abstract of the Statute and Civil Law, in Relation to Pyracy">  …\n      \n  </span><span class="q-hellip-b"><span style="font-style: normal;">]</span>"""
        v = clean_value(self.config, v)
        self.assertEqual(v, "[…]")