Example #1
# Assumes: import os; from wiktextract import parse_wiktionary
def _extrair_termos(self):
    # Parse the local compressed dump, keeping only Portuguese and
    # Translingual entries; translations, pronunciations and redirects
    # are skipped.
    ctx = parse_wiktionary(
        os.path.join(self._local, 'wiktionary.xml.bz2'),
        word_cb=self.word_cb,
        capture_cb=self.capture_cb,
        languages=["Portuguese", "Translingual"],
        translations=False,
        pronunciations=False,
        redirects=False)
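
The method refers to callbacks defined elsewhere on its class. As a minimal sketch (the body and the _termos attribute are assumptions, not part of the original), a word callback simply receives one dict per extracted entry:

def word_cb(self, data):
    # Called once per extracted entry; collect entries on the instance.
    # `_termos` is a hypothetical list attribute.
    self._termos.append(data)
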
Example #2
import wiktextract
from neo4jcontroller import DbController

filePath = 'enwiktionary-20190501-pages-meta-current.xml'

db = DbController('bolt://localhost:7687', 'neo4j', 'h6u4%kr')


def processWord(data):
    print(data)
    # Redirect entries have no "lang" key; skip them.
    if "lang" not in data:
        return
    db.createWordNode(data)


ctx = wiktextract.parse_wiktionary(path=filePath,
                                   word_cb=processWord,
                                   capture_cb=None,
                                   languages=["English", "Translingual"],
                                   translations=False,
                                   pronunciations=False,
                                   redirects=True)
Example #3
import json

import wiktextract

all_data = {}      # headword -> extracted entry
count = {'c': 0}   # mutable counter so the callback can update it
# Hypothetical path; point this at your downloaded dump.
path = 'enwiktionary-latest-pages-articles.xml.bz2'


def cb(data):
    word = data['word']
    if word not in all_data:
        all_data[word] = data
    else:
        # Duplicate headword (e.g. a second part of speech): store it
        # under a suffixed key instead of overwriting the first entry.
        all_data[word + '_'] = data
    if count['c'] == 0:
        # Write a one-entry smoke-test file on the first callback.
        with open('test.json', 'w') as f:
            f.write(json.dumps(all_data))
    count['c'] = count['c'] + 1
    if count['c'] % 100 == 0:
        print(count['c'])


ctx = wiktextract.parse_wiktionary(path,
                                   cb,
                                   capture_cb=None,
                                   languages=["German"],
                                   translations=True,
                                   linkages=True,
                                   pronunciations=False,
                                   redirects=False)

out = 'alles.json'

with open(out, 'w') as f:
    f.write(json.dumps(all_data))
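
To reuse the extracted entries later, the file written above can be loaded back in one call; a minimal sketch:

import json

with open('alles.json') as f:
    all_data = json.load(f)
print(len(all_data), 'entries loaded')
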
Example #4
import json
import os
import sys

import wiktextract

path = r'utils/enwiktionary-20200620-pages-articles-multistream.xml.bz2'
out_path = r'FO_inflection_data/wiktionary'

if not os.path.isdir(out_path):
    os.mkdir(out_path)

cnt = []  # one item appended per entry; len(cnt) serves as a progress counter


def word_cb(data):
    cnt.append('')
    filename = str(data.get('pos', 'no-pos')) + '.json'
    out_file = os.path.join(out_path, filename)
    print(f'{len(cnt)}\t{data}')
    # json.dump(data, sys.stdout,  ensure_ascii=False, indent=4)
    with open(out_file, 'a') as output:
        json.dump(data, output, ensure_ascii=False, indent=4)
        output.write('\n')


ctx = wiktextract.parse_wiktionary(path,
                                   word_cb,
                                   capture_cb=None,
                                   languages=["Faroese"],
                                   translations=False,
                                   pronunciations=False,
                                   redirects=False)
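
Because the callback appends pretty-printed objects back to back, the per-POS files are concatenated JSON rather than JSON Lines. One way to read one back (the noun.json filename is an assumption based on the pos field):

import json

decoder = json.JSONDecoder()
with open('FO_inflection_data/wiktionary/noun.json', encoding='utf-8') as f:
    text = f.read()
entries, pos = [], 0
while pos < len(text):
    # raw_decode returns the next object and the index where it ends.
    obj, end = decoder.raw_decode(text, pos)
    entries.append(obj)
    pos = end
    while pos < len(text) and text[pos].isspace():
        pos += 1
print(len(entries), 'noun entries')
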
Example #5
import json
import logging
import re

import wiktextract

logger = logging.getLogger(__name__)


def main():
    words = {}
    wordlike = re.compile(r"[a-zA-Z']+")

    def word_cb(data):
        word = data['word']
        ipas = []

        if word.upper() in words:
            if word != word.upper():
                # We can use the capitalization from Wiktionary
                words[word] = words.pop(word.upper())
        elif word == word.upper():
            return  # We don't need to add a bunch of new acronyms
        else:
            match = wordlike.match(word)
            if not match:
                logger.error(f'what even is this: {word}')
                return  # What even is this
            elif match.span()[1] != len(word):
                logger.warning(f'skipping "{word}"')
                return  # We don't care about extended wiktionary entries

        if 'pronunciations' not in data:
            return  # We only care about entries with a pronunciation
        pronunciations = data['pronunciations']
        for pronunciation in pronunciations:
            if 'ipa' in pronunciation:
                # Strip syllable breaks, stress marks and parentheses.
                ipa = re.sub('[.ˌˈ()]', '', pronunciation['ipa'][0][1][1:-1])
                append = False
                if 'accent' in pronunciation:
                    accent = pronunciation['accent']
                    if 'US' in accent or 'GenAm' in accent or 'GA' in accent:
                        append = True
                else:
                    append = True
                if append and ipa not in ipas:
                    ipas.append(ipa)

        if ipas:
            if word in words:
                for existing in words[word]:
                    try:
                        ipas.remove(existing)
                    except ValueError:
                        pass
                if ipas:
                    words[word].extend(ipas)
                    logger.debug(f'{word}: {words[word]}')

            else:
                words.update({word: ipas})
                logger.info(f'NEW {word}: {words[word]}')

    cmudict = 'cmudict-0.7b-ipa.txt'
    with open(cmudict, 'r') as infile:
        # There are 69 lines of symbols before the words
        for _ in range(69):
            next(infile)
        for line in infile:
            word, pronunciations = line.rstrip().split('\t')
            if '-' not in word:
                word = word.replace('.', '').rstrip()
                pronunciations = pronunciations.split(', ')
                pronunciations = [
                    re.sub('[.ˌˈ()]', '', x) for x in pronunciations
                ]
                words.update({word: pronunciations})

    path = 'enwiktionary-latest-pages-articles.xml'
    logger.debug(f'Processing {path}')
    ctx = wiktextract.parse_wiktionary(path=path,
                                       word_cb=word_cb,
                                       pronunciations=True)
    with open('dict.json', 'w') as outfile:
        json.dump(words, fp=outfile, indent=2)


if __name__ == '__main__':
    main()
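
The resulting dict.json maps each word to its list of IPA pronunciations. A quick way to spot-check it ('hello' is just an illustrative key):

import json

with open('dict.json') as f:
    words = json.load(f)
print(words.get('hello') or words.get('HELLO'))
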
Example #6
    # Assumes: import collections; import wiktextract (test method from
    # the project's test suite).
    def test_long(self):
        # Just parse through the data and make sure that we find some words.
        # This takes about 1.5 minutes.

        langs = collections.defaultdict(int)
        words = collections.defaultdict(int)
        poses = collections.defaultdict(int)
        num_transl = 0
        num_pron = 0
        num_conj = 0
        num_redirects = 0

        def word_cb(data):
            nonlocal num_transl
            nonlocal num_pron
            nonlocal num_conj
            nonlocal num_redirects
            word = data["word"]
            assert word
            words[word] += 1
            if "redirect" in data:
                assert isinstance(data["redirect"], str)
                num_redirects += 1
                return
            lang = data["lang"]
            pos = data["pos"]
            assert word and lang and pos
            langs[lang] += 1
            poses[pos] += 1
            if data.get("conjugation"):
                num_conj += 1
            if data.get("translations"):
                num_transl += 1
            sounds = data.get("pronunciations", ())
            if sounds and any("ipa" in x for x in sounds):
                num_pron += 1

        path = "wiktextract/tests/test-pages-articles.xml.bz2"
        print("Parsing test data")
        wiktextract.parse_wiktionary(
            path,
            word_cb,
            languages=["English", "Finnish", "Translingual"],
            translations=True,
            pronunciations=True,
            linkages=True,
            compounds=True,
            redirects=True)
        print("Test data parsing complete")
        assert num_redirects > 0
        assert len(words) > 100
        assert all(x < 50 for x in words.values())
        assert langs["English"] > 0
        assert langs["Finnish"] > 0
        assert langs["Translingual"] > 0
        assert len(langs.keys()) == 3
        assert len(poses.keys()) <= len(wiktextract.PARTS_OF_SPEECH)
        assert sum(poses.values()) == sum(langs.values())
        assert sum(words.values()) == sum(poses.values()) + num_redirects
        assert num_conj > 0
        assert num_transl > 0
        assert num_pron > 0
Example #7
import json

import requests
import wiktextract

# Hypothetical dump URL and filename; any enwiktionary pages-articles
# dump works.
url = ('https://dumps.wikimedia.org/enwiktionary/latest/'
       'enwiktionary-latest-pages-articles.xml.bz2')
xml_fn = 'enwiktionary-latest-pages-articles.xml.bz2'
response = requests.get(url, stream=True)

with open(xml_fn, 'wb') as handle:
    for block in response.iter_content(4096):
        handle.write(block)

print("Downloaded XML dump, beginning processing...")

fh = open("output.json", "w", encoding="utf-8")


def word_cb(data):
    # One JSON object per line (JSON Lines).
    fh.write(json.dumps(data) + "\n")


ctx = wiktextract.parse_wiktionary(xml_fn,
                                   word_cb,
                                   languages=["English", "Translingual"])

print("{} English entries processed.".format(ctx.language_counts["English"]))
print("{} bytes written to output.json".format(fh.tell()))

fh.close()
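
Since word_cb writes one JSON object per line, output.json can be read back as JSON Lines; a minimal sketch:

import json

entries = []
with open('output.json', encoding='utf-8') as f:
    for line in f:
        entries.append(json.loads(line))
print(len(entries), 'entries loaded')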

Example #8
import wiktextract as wk
import dataset as ds

DB = ds.connect('sqlite:///german.db')

NOUNS = DB['noun']


def add_noun(word, conjugation):
    # Entries whose number is tagged "sg" are singular-only nouns:
    # clear their plural fields in the database.
    if conjugation.get("n") == 'sg':
        data = NOUNS.find_one(word=word)
        if data:
            data["plural_ending"] = None
            data["plural"] = None
            NOUNS.update(data, ["word"])
            print("found one: " + word)


def word_cb(data):
    if 'conjugation' in data:
        if data['pos'] in ['noun', 'name']:
            for conjugation in data['conjugation']:
                add_noun(data['word'], conjugation)


wk.parse_wiktionary('enwiktionary.xml.bz2', word_cb, languages=['German'])
print("Yay, all parsed")