Example #1
0
 def run_data(self, item, word="testpage", lang="English",
              field="related", ruby="", sense=None, senses=None,
              ctx=None, is_reconstruction=False):
     """Runs a test where we expect the parsing to return None.  This
     function returns ``data``.

     ``item`` is the linkage item text to parse; ``word``/``lang`` name
     the page and section it is parsed under, and the parsed results are
     accumulated under ``data[field]``.  When ``ctx`` is None a fresh
     ``Wtp`` context is created and the test additionally asserts that
     no errors, warnings, or debug messages were produced; pass an
     explicit ``ctx`` for tests that expect debug messages."""
     # Use a None sentinel instead of a mutable default: a shared []
     # default would be the same list object across all calls.
     if senses is None:
         senses = []
     assert isinstance(item, str)
     assert isinstance(word, str)
     assert isinstance(lang, str)
     assert isinstance(field, str)
     assert isinstance(ruby, str)
     assert sense is None or isinstance(sense, str)
     assert isinstance(senses, list)
     assert ctx is None or isinstance(ctx, Wtp)
     ctx1 = ctx if ctx is not None else Wtp()
     self.ctx = ctx1
     self.config = WiktionaryConfig()
     self.ctx.start_page(word)
     self.ctx.start_section(lang)
     data = {}
     ret = parse_linkage_item_text(self.ctx, word, data, field, item,
                                   sense, ruby, senses, is_reconstruction)
     # The parser is expected to report results via ``data``, not the
     # return value.
     self.assertIs(ret, None)
     if ctx is None:
         # Caller did not opt into debug messages, so none may appear.
         self.assertEqual(self.ctx.errors, [])
         self.assertEqual(self.ctx.warnings, [])
         self.assertEqual(self.ctx.debugs, [])
     return data
Example #2
0
 def test_gender5(self):
     # Numeric inflection classes should only be interpreted for certain
     # languages (e.g., Bantu languages)
     wtp = Wtp()  # dedicated context so debug messages are permitted
     result = self.run_data("foo 1", lang="Swedish", ctx=wtp)
     expected = {"related": [{"word": "foo 1"}]}
     self.assertEqual(result, expected)
Example #3
0
 def test_gender13(self):
     # They should not be interpreted for other languages
     wtp = Wtp()  # dedicated context so debug messages are permitted
     result = self.run_data("foo 1 or 2", lang="English", ctx=wtp)
     expected = {
         "related": [{"word": "foo 1"}, {"word": "2"}],
     }
     self.assertEqual(result, expected)
Example #4
0
    def test_long(self):
        # Just parse through the data and make sure that we find some words
        # This takes about 0.5 minutes.

        langs = collections.defaultdict(int)   # entry count per language
        words = collections.defaultdict(int)   # entry count per word/title
        poses = collections.defaultdict(int)   # entry count per part of speech
        num_transl = 0      # words/senses that carry translations
        num_redirects = 0   # redirect pages seen

        def word_cb(data):
            # Called once per extracted entry; tallies counts for the
            # sanity checks below.
            nonlocal num_transl
            nonlocal num_redirects
            if "redirect" in data:
                # Redirect entries carry "title"/"redirect" rather than
                # "word"/"lang"/"pos".
                assert isinstance(data["redirect"], str)
                word = data["title"]
                words[word] += 1
                num_redirects += 1
                return
            word = data["word"]
            assert word
            words[word] += 1
            lang = data["lang"]
            pos = data["pos"]
            assert word and lang and pos
            langs[lang] += 1
            poses[pos] += 1
            # Translations may appear at the word level and per-sense;
            # count both.
            if data.get("translations"):
                num_transl += 1
            for sense in data.get("senses", ()):
                if sense.get("translations"):
                    num_transl += 1

        path = "tests/test-pages-articles.xml.bz2"
        print("Parsing test data")
        ctx = Wtp()
        config = WiktionaryConfig(capture_languages=[
            "English", "Finnish", "Spanish", "German", "Chinese", "Japanese",
            "Italian", "Portuguese", "Translingual"
        ],
                                  capture_translations=True,
                                  capture_pronunciation=True,
                                  capture_linkages=True,
                                  capture_compounds=True,
                                  capture_redirects=True)
        parse_wiktionary(ctx, path, config, word_cb, None)
        print("Test data parsing complete")
        # Sanity checks on the aggregate counts collected by word_cb.
        assert num_redirects > 0
        assert len(words) > 100
        assert all(x < 50 for x in words.values())
        assert langs["English"] > 0
        assert langs["Finnish"] > 0
        assert langs["Translingual"] > 0
        # One bucket per language in capture_languages above.
        assert len(langs.keys()) == 9
        assert len(poses.keys()) <= len(wiktextract.PARTS_OF_SPEECH)
        # Each non-redirect entry increments exactly one lang and one pos.
        assert sum(poses.values()) == sum(langs.values())
        assert sum(words.values()) == sum(poses.values()) + num_redirects
        assert num_transl > 0
Example #5
0
 def test_gender15(self):
     # inclusive or/English/Translations
     wtp = Wtp()  # dedicated context so debug messages are permitted
     result = self.run_data("μη αποκλειστικό or n (mi apokleistikó or)",
                            lang="Greek", word="inclusive or", ctx=wtp)
     expected = {
         "related": [
             {"word": "μη αποκλειστικό or",
              "tags": ["neuter"],
              "roman": "mi apokleistikó or"},
         ],
     }
     self.assertEqual(result, expected)
Example #6
0
def parse_with_ctx(title, text, **kwargs):
    """Parse ``text`` as page ``title`` in a fresh Wtp context.

    Extra keyword arguments are forwarded to ``Wtp.parse``.  Returns a
    (parse tree, context) tuple so callers can inspect errors/warnings.
    """
    assert isinstance(title, str)
    assert isinstance(text, str)
    wtp = Wtp()
    wtp.analyze_templates()
    wtp.start_page(title)
    tree = wtp.parse(text, **kwargs)
    print("parse_with_ctx: root", type(tree), tree)
    return tree, wtp
Example #7
0
 def test_prefix16(self):
     # Triggers an error due to invalid gloss reference
     wtp = Wtp()
     sense_list = [
         {"glosses": ["sense1"]},
         {"glosses": ["sense2", "sense2b"]},
         {"glosses": ["sense3"]},
     ]
     result = self.run_data("(4): foo", ctx=wtp, senses=sense_list)
     self.assertEqual(result, {"related": [{"word": "foo"}]})
     # The out-of-range "(4)" reference must produce a debug message.
     self.assertNotEqual(wtp.debugs, [])
Example #8
0
 def setUp(self):
     # Show long diffs in full on assertion failures.
     self.maxDiff = 20000
     self.ctx = Wtp()
     self.ctx.analyze_templates()
     self.ctx.start_page("testpage")
     capture_opts = dict(capture_languages=None,
                         capture_translations=True,
                         capture_pronunciation=True,
                         capture_linkages=True,
                         capture_compounds=True,
                         capture_redirects=True,
                         capture_examples=True)
     self.config = WiktionaryConfig(**capture_opts)
Example #9
0
    def runonce(self, num_threads):
        """Parse the bundled test dump with the given thread count and
        sanity-check the returned (title, redirect) pairs."""
        path = "tests/test-pages-articles.xml.bz2"
        print("Parsing test data")
        ctx = Wtp(num_threads=num_threads)
        ret = ctx.process(path, page_cb)
        title_counts = collections.Counter()
        redirect_counts = collections.Counter()
        for page_title, redirect_target in ret:
            title_counts[page_title] += 1
            if redirect_target is not None:
                redirect_counts[redirect_target] += 1

        print("Test data parsing complete")
        # At least one redirect, plenty of unique pages, no duplicates.
        assert sum(redirect_counts.values()) > 0
        assert len(title_counts) > 100
        assert all(n == 1 for n in title_counts.values())
        assert len(redirect_counts) > 1
 def setUp(self):
     # Show long diffs in full on assertion failures.
     self.maxDiff = 100000
     self.config = WiktionaryConfig()
     self.ctx = Wtp()
     self.ctx.start_page("testpage")
     self.ctx.start_section("English")
Example #11
0
 def setUp(self):
     # Fresh parsing context and default configuration for each test.
     self.config = WiktionaryConfig()
     self.ctx = Wtp()
     self.ctx.start_page("testpage")
Example #12
0
 def test_prefix18(self):
     # Triggers error due to invalid prefix
     wtp = Wtp()
     result = self.run_data("dsafjdasfkldjas: foo", ctx=wtp)
     expected = {"related": [{"word": "foo"}]}
     self.assertEqual(result, expected)
     # The bogus prefix must have generated a debug message.
     self.assertNotEqual(wtp.debugs, [])
Example #13
0
        truncate_title = False
    if lst[-1] in ("A", "B", "C", "D", "E", "F", "I", "II", "III", "IV", "V"):
        truncate_title = False
    if not (len(lst) > 1 and any(x[0].isupper() for x in lst[1:]) and
            (len(lst[-1]) != 1 or lst[-1] not in ("virus",))):
        truncate_title = False

    if truncate_title:
        for i in range(1, len(lst)):
            if lst[i][0].isupper():
                title = " ".join(lst[:i])
                break

    return title

ctx = Wtp(cache_file="species-cache")
# Disable this for later runs to avoid recreating the cache.  Makes developing
# the code MUCH faster.  Remove the cache file before reading the dump.
# Read pages from the dump file into the cache file (Phase 1)
#list(ctx.process(dumpfile, page_handler, phase1_only=True))

# Process the pages in the dump file.
ret = list(ctx.reprocess(page_handler))

# Summarize the extracted titles.
print("Count distinct titles:", len(set(ret)))
# First words of titles, skipping virus/satellite/viroid names.
firsts = set(x.split()[0] for x in ret
             if x.find("virus") < 0 and x.find("satellite") < 0 and
             x.find("viroid") < 0)
print("Count distinct first words:", len(firsts))
# All distinct lower-cased words occurring anywhere in any title.
words = set(x for title in ret for x in title.lower().split())
print("Count distinct latter:", len(words))
Example #14
0
        if node.kind not in (NodeKind.LEVEL2, NodeKind.LEVEL3,
                             NodeKind.LEVEL4, NodeKind.LEVEL5,
                             NodeKind.LEVEL6):
            continue
        if (len(node.args) != 1 or len(node.args[0]) != 1 or
            not isinstance(node.args[0][0], str)):
            print("  {} - {}: {}".format(title, node.kind, node.children))
            continue
        t = node.args[0][0]
        assert isinstance(t, str)
        print("  {} - {}".format(title, t))
        titles.append(t)
    sys.stdout.flush()
    return title, titles, ctx.errors

ctx = Wtp()
# Run page_handler over every page in the dump; each result is a
# (page title, extracted section titles, error list) tuple.
ret = ctx.process(path, page_handler)
counts = collections.defaultdict(int)      # occurrences per error message
titles_ht = collections.defaultdict(list)  # section title -> pages using it
for page_title, titles, errors in ret:
    for title in titles:
        titles_ht[title].append(page_title)
    for err in errors:
        msg = err["msg"]
        counts[msg] += 1

# Report the 40 most frequent error messages, most common first.
print("=== MOST COMMON ERRORS")
errors = list(sorted(counts.items(), key=lambda x: x[1], reverse=True))
for err, cnt in errors[:40]:
    print(cnt, err)
 def setUp(self):
     # Show long diffs in full on assertion failures.
     self.maxDiff = 20000
     self.config = WiktionaryConfig()
     self.ctx = Wtp()
     # Note: some tests use last char
     self.ctx.start_page("abolitionism")
     self.ctx.start_section("English")