Exemple #1
0
    result = []
    for id, text in getpages(bz2data):
        for line in text:
            if line.startswith("==") and not line.startswith("==="):
                lang = line[2:].strip()
                e = len(lang) - 1
                while e > 0 and lang[e] == '=':
                    e -= 1
                result.append((id, lang[:e + 1].strip()))
    return result


if __name__ == "__main__":
    import concurrent.futures

    target, spos, slen = mediawiki_parse.read()
    slen.pop()
    split = 10
    poslens = [(target, spos[i], sum(slen[i:i + split]))
               for i in range(1, len(slen), split)]

    results, langs = [], {}
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for result in executor.map(getlangs, poslens):
            for id, lang in result:
                if lang in langs:
                    lid = langs[lang]
                else:
                    lid = len(langs) + 1
                    langs[lang] = lid
                results.append((id, lid))
import mediawiki_parse

target, _, slen = mediawiki_parse.read()
getpages = mediawiki_parse.getpages
results, langs = [], {}
with open(target, "rb") as f:
    f.seek(slen[0])
    for length in slen[1:-1]:
        for id, text in getpages(f.read(length)):
            for line in text:
                if len(line) >= 3 and line[0] == '=' and line[
                        1] == '=' and line[2] != '=':
                    lang = line[2:].strip()
                    e = len(lang) - 1
                    while e > 0 and lang[e] == '=':
                        e -= 1
                    lang = lang[:e + 1].strip()
                    if lang in langs:
                        lid = langs[lang]
                    else:
                        lid = len(langs) + 1
                        langs[lang] = lid
                    results.append((id, lid))

with open("output1.tsv", "w", encoding="utf-8") as f:
    for id, lid in results:
        f.write(f"{id}\t{lid}\n")

with open("output2.tsv", "w", encoding="utf-8") as f:
    for k, v in langs.items():
        f.write(f"{v}\t{k}\n")