def _extrair_termos(self):
    # "Extract terms": parse the local Wiktionary dump, keeping only
    # Portuguese and Translingual entries.
    ctx = parse_wiktionary(
        os.path.join(self._local, 'wiktionary.xml.bz2'),
        word_cb=self.word_cb,
        capture_cb=self.capture_cb,
        languages=["Portuguese", "Translingual"],
        translations=False,
        pronunciations=False,
        redirects=False)
import wiktextract
# import json  # or `import simplejson as json` if on Python < 2.6
from neo4jcontroller import DbController

filePath = 'enwiktionary-20190501-pages-meta-current.xml'
db = DbController('bolt://localhost:7687', 'neo4j', 'h6u4%kr')

def processWord(data):
    print(data)
    if "lang" not in data:
        return
    # obj = json.loads(data)
    db.createWordNode(data)

ctx = wiktextract.parse_wiktionary(path=filePath,
                                   word_cb=processWord,
                                   capture_cb=None,
                                   languages=["English", "Translingual"],
                                   translations=False,
                                   pronunciations=False,
                                   redirects=True)
import json
import wiktextract

# Presumably initialized before parsing in the original script:
# collected entries keyed by word, and a simple call counter.
all_data = {}
count = {'c': 0}

def cb(data):
    word = data['word']
    if word not in all_data:
        all_data[word] = data
    else:
        word += '_'
    if count['c'] == 0:
        with open('test.json', 'w') as f:
            f.write(json.dumps(all_data))
    count['c'] = count['c'] + 1
    if count['c'] % 100 == 0:
        print(count['c'])

# path: location of the Wiktionary dump file (set elsewhere in the original script).
ctx = wiktextract.parse_wiktionary(path, cb,
                                   capture_cb=None,
                                   languages=["German"],
                                   translations=True,
                                   linkages=True,
                                   pronunciations=False,
                                   redirects=False)

out = 'alles.json'
with open(out, 'w') as f:
    f.write(json.dumps(all_data))
import sys
import os
import json
import wiktextract

path = r'utils/enwiktionary-20200620-pages-articles-multistream.xml.bz2'
out_path = r'FO_inflection_data/wiktionary'
if not os.path.isdir(out_path):
    os.mkdir(out_path)

cnt = []

def word_cb(data):
    # One marker per processed entry, so len(cnt) is a running count.
    cnt.append('')
    # Append each entry, as pretty-printed JSON, to a per-POS file.
    filename = str(data.get('pos', 'no-pos')) + '.json'
    out_file = os.path.join(out_path, filename)
    print(f'{len(cnt)}\t{data}')
    # json.dump(data, sys.stdout, ensure_ascii=False, indent=4)
    with open(out_file, 'a') as output:
        json.dump(data, output, ensure_ascii=False, indent=4)
        output.write('\n')

ctx = wiktextract.parse_wiktionary(path, word_cb,
                                   capture_cb=None,
                                   languages=["Faroese"],
                                   translations=False,
                                   pronunciations=False,
                                   redirects=False)
import json
import logging
import re

import wiktextract

logger = logging.getLogger(__name__)


def main():
    words = {}
    wordlike = re.compile(r"[a-zA-Z']+")

    def word_cb(data):
        word = data['word']
        ipas = []
        if word.upper() in words:
            if word != word.upper():
                # We can use the capitalization from Wiktionary
                words[word] = words.pop(word.upper())
            elif word == word.upper():
                return  # We don't need to add a bunch of new acronyms
        else:
            match = wordlike.match(word)
            if not match:
                logger.error(f'what even is this: {word}')
                return  # What even is this
            elif match.span()[1] != len(word):
                logger.warning(f'skipping "{word}"')
                return  # We don't care about extended wiktionary entries
        if 'pronunciations' not in data:
            return  # We only care about entries with a pronunciation
        pronunciations = data['pronunciations']
        for pronunciation in pronunciations:
            if 'ipa' in pronunciation:
                # Strip syllable and stress marks from the IPA transcription.
                ipa = re.sub('[.ˌˈ()]', '', pronunciation['ipa'][0][1][1:-1])
                append = False
                if 'accent' in pronunciation:
                    accent = pronunciation['accent']
                    if 'US' in accent or 'GenAm' in accent or 'GA' in accent:
                        append = True
                else:
                    append = True
                if append and ipa not in ipas:
                    ipas.append(ipa)
        if ipas:
            if word in words:
                # Only keep pronunciations not already present for this word.
                for existing in words[word]:
                    try:
                        ipas.remove(existing)
                    except ValueError:
                        pass
                if ipas:
                    words[word].extend(ipas)
                    logger.debug(f'{word}: {words[word]}')
            else:
                words.update({word: ipas})
                logger.info(f'NEW {word}: {words[word]}')

    cmudict = 'cmudict-0.7b-ipa.txt'
    with open(cmudict, 'r') as infile:
        # There are 69 lines of symbols before the words
        for _ in range(69):
            next(infile)
        for line in infile:
            word, pronunciations = line.rstrip().split('\t')
            if '-' not in word:
                word = word.replace('.', '').rstrip()
                pronunciations = pronunciations.split(', ')
                pronunciations = [
                    re.sub('[.ˌˈ()]', '', x) for x in pronunciations
                ]
                words.update({word: pronunciations})

    path = 'enwiktionary-latest-pages-articles.xml'
    logger.debug(f'Processing {path}')
    ctx = wiktextract.parse_wiktionary(path=path, word_cb=word_cb,
                                       pronunciations=True)

    with open('dict.json', 'w') as outfile:
        json.dump(words, fp=outfile, indent=2)
def test_long(self):
    # Just parse through the data and make sure that we find some words
    # This takes about 1.5 minutes.
    langs = collections.defaultdict(int)
    words = collections.defaultdict(int)
    poses = collections.defaultdict(int)
    num_transl = 0
    num_pron = 0
    num_conj = 0
    num_redirects = 0

    def word_cb(data):
        nonlocal num_transl
        nonlocal num_pron
        nonlocal num_conj
        nonlocal num_redirects
        word = data["word"]
        assert word
        words[word] += 1
        if "redirect" in data:
            assert isinstance(data["redirect"], str)
            num_redirects += 1
            return
        lang = data["lang"]
        pos = data["pos"]
        assert word and lang and pos
        langs[lang] += 1
        poses[pos] += 1
        if data.get("conjugation"):
            num_conj += 1
        if data.get("translations"):
            num_transl += 1
        sounds = data.get("pronunciations", ())
        if sounds and any("ipa" in x for x in sounds):
            num_pron += 1

    path = "wiktextract/tests/test-pages-articles.xml.bz2"
    print("Parsing test data")
    wiktextract.parse_wiktionary(
        path, word_cb,
        languages=["English", "Finnish", "Translingual"],
        translations=True,
        pronunciations=True,
        linkages=True,
        compounds=True,
        redirects=True)
    print("Test data parsing complete")
    assert num_redirects > 0
    assert len(words) > 100
    assert all(x < 50 for x in words.values())
    assert langs["English"] > 0
    assert langs["Finnish"] > 0
    assert langs["Translingual"] > 0
    assert len(langs.keys()) == 3
    assert len(poses.keys()) <= len(wiktextract.PARTS_OF_SPEECH)
    assert sum(poses.values()) == sum(langs.values())
    assert sum(words.values()) == sum(poses.values()) + num_redirects
    assert num_conj > 0
    assert num_transl > 0
    assert num_pron > 0
with open(xml_fn, 'wb') as handle:
    for block in response.iter_content(4096):
        handle.write(block)

print("Downloaded XML dump, beginning processing...")

fh = open("output.json", "w")  # text mode, since json.dumps() returns str

def word_cb(data):
    fh.write(json.dumps(data))

ctx = wiktextract.parse_wiktionary(xml_fn, word_cb,
                                   languages=["English", "Translingual"])
print("{} English entries processed.".format(ctx.language_counts["English"]))
print("{} bytes written to output.json".format(fh.tell()))
fh.close()

# import wiktextract
# path_output = "./output"
# ctx = wiktextract.parse_wiktionary(
#     path_output, word_cb=,
#     capture_cb=None,
#     languages=["English", "Translingual"],
#     translations=False,
import wiktextract as wk
import dataset as ds

DB = ds.connect('sqlite:///german.db')
NOUNS = DB['noun']

def add_noun(word, conjugation):
    # Singular-only nouns ("n": "sg") have no plural form, so clear any
    # plural columns already stored for them.
    if "n" in conjugation and conjugation["n"] == 'sg':
        data = NOUNS.find_one(word=word)
        if data:
            data["plural_ending"] = None
            data["plural"] = None
            NOUNS.update(data, ["word"])
            print("found one: " + word)

def word_cb(data):
    if 'conjugation' in data:
        if data['pos'] in ['noun', 'name']:
            for conjugation in data['conjugation']:
                add_noun(data['word'], conjugation)

wk.parse_wiktionary('enwiktionary.xml.bz2', word_cb, languages=['German'])
print("Yay, all parsed")