Beispiel #1
0
    def runonce(self, num_threads):
        # Just parse through the data and make sure that we find some words
        path = "tests/test-pages-articles.xml.bz2"
        print("Parsing test data")
        ctx = Wtp(num_threads=num_threads)
        ret = ctx.process(path, page_cb)
        titles = collections.defaultdict(int)
        redirects = collections.defaultdict(int)
        for title, redirect_to in ret:
            titles[title] += 1
            if redirect_to is not None:
                redirects[redirect_to] += 1

        print("Test data parsing complete")
        assert sum(redirects.values()) > 0
        assert len(titles) > 100
        assert all(x == 1 for x in titles.values())
        assert len(redirects) > 1
Beispiel #2
0
                             NodeKind.LEVEL4, NodeKind.LEVEL5,
                             NodeKind.LEVEL6):
            continue
        if (len(node.args) != 1 or len(node.args[0]) != 1 or
            not isinstance(node.args[0][0], str)):
            print("  {} - {}: {}".format(title, node.kind, node.children))
            continue
        t = node.args[0][0]
        assert isinstance(t, str)
        print("  {} - {}".format(title, t))
        titles.append(t)
    sys.stdout.flush()
    return title, titles, ctx.errors

ctx = Wtp()
ret = ctx.process(path, page_handler)
counts = collections.defaultdict(int)
titles_ht = collections.defaultdict(list)
for page_title, titles, errors in ret:
    for title in titles:
        titles_ht[title].append(page_title)
    for err in errors:
        msg = err["msg"]
        counts[msg] += 1

print("=== MOST COMMON ERRORS")
errors = list(sorted(counts.items(), key=lambda x: x[1], reverse=True))
for err, cnt in errors[:40]:
    print(cnt, err)

print("=== Saving non-language titles in temp-nonlangs.json")