Example #1
from mikatools import json_load, script_path

# Module-level cache; filled lazily on the first call.
wiktionary = None

def load_wiktionary():
    global wiktionary
    if wiktionary is not None:
        return
    try:
        wiktionary = set(
            x.lower() for x in json_load(script_path("wiktionary_lemmas.json"))
        )
    except Exception:
        # The lemma list ships separately; ask the user to download it.
        print("run python -m natas.download")
        wiktionary = set()
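A minimal usage sketch within the same module (the test word "house" is only an illustration): populate the cache once, then run cheap membership checks against it.

load_wiktionary()
print("house" in wiktionary)  # True once the lemma list contains "house"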
Example #2
import os

import hfst
import mikatools

def _load_transducer(filename, invert):
    metadata_filename = os.path.join(os.path.dirname(filename),
                                     "metadata.json")
    try:
        metadata = mikatools.json_load(metadata_filename)
    except Exception:
        # Don't crash if metadata.json is missing or malformed;
        # fall back to the default HFST loader below.
        metadata = {}
    if metadata.get("fst_type") == "foma":
        # FomaFSTWrapper is defined elsewhere in this package.
        return FomaFSTWrapper(filename, invert)
    else:
        input_stream = hfst.HfstInputStream(filename)
        return input_stream.read()
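A hedged usage sketch: assuming the file holds a compiled HFST analyser, the returned transducer can be queried with lookup(). The path and the input word are placeholders.

analyser = _load_transducer("analyser.hfst", invert=False)
for output, weight in analyser.lookup("koira"):
    print(output, weight)  # each result is an (output string, weight) pair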
Example #3
from mikatools import script_path, json_load
from onmt.translate.translator import Translator
from onmt.decoders.ensemble import load_test_model
from onmt.translate import GNMTGlobalScorer
from itertools import islice, repeat
import configargparse as cfargparse
import spacy
import os


wiktionary = set([x.lower() for x in json_load(script_path("wiktionary_lemmas.json"))])

is_in_data_cache = {"ceec_eng":{}, "ocr_fin":{}}

# Shared model cache (currently only the spaCy pipeline).
models = {}

def set_spacy(nlp):
    models["spacy"] = nlp

def _get_spacy():
    if "spacy" not in models:
        try:
            models["spacy"] = spacy.load('en_core_web_md')
        except IOError:
            raise Exception("Spacy model was not loaded! Run: python -m spacy download en_core_web_md")
    return models["spacy"]

def split_corpus(f, shard_size):
    # Yield the corpus in lists of shard_size lines; a non-positive
    # shard_size yields the whole file object in one piece.
    if shard_size <= 0:
        yield f
    else:
        while True:
            shard = list(islice(f, shard_size))
            if not shard:
                break
            yield shard
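A short usage sketch for split_corpus; "corpus.txt" is a placeholder path.

with open("corpus.txt") as f:
    for shard in split_corpus(f, shard_size=1000):
        print(len(shard))  # each shard is a list of up to 1000 lines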
Example #4
import os
import mikatools

def model_info(language):
    # __where_models (defined elsewhere in this module) resolves the model directory.
    filename = os.path.join(__where_models(language), "metadata.json")
    d = mikatools.json_load(filename)
    mikatools.print_json_help(d)
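Hypothetical call, assuming a model for the given ISO code has been downloaded:

model_info("fin")  # pretty-prints the model's metadata.json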
Example #5
#encoding: utf-8
from __future__ import unicode_literals
import re, unicodedata
import mikatools

isos = mikatools.json_load(mikatools.script_path("lang_codes.json"))

# Matches one base character together with any run of trailing combining
# diacritics, so accented letters are kept as single units.
pattern = re.compile(
    r'(\w[\u02F3\u0300\u2013\u032E\u208D\u203F\u0311\u0323\u035E\u031C\u02FC\u030C\u02F9\u0328\u032D:\u02F4\u032F\u0330\u035C\u0302\u0327\u03572\u0308\u0351\u0304\u02F2\u0352\u0355\u00B7\u032C\u030B\u2019\u0339\u00B4\u0301\u02F1\u0303\u0306\u030A7\u0325\u0307\u0354`\u02F0]+|\w|\W)',
    re.UNICODE | re.IGNORECASE)


def char_split(word):
    word = unicodedata.normalize('NFKC', word)
    _result = pattern.findall(word)
    return list(_result)


def filter_arabic(text, keep_vowels=True, combine_by=""):
    # Keep only runs of Arabic letters (optionally including vowel and
    # other diacritic marks) and join the runs with combine_by.
    if keep_vowels:
        return combine_by.join(re.findall(r"[ء-ي'ًٌٍَُِّْـ']+", text))
    else:
        return combine_by.join(re.findall(r"[ء-ي]+", text))


def iso_to_name(iso):
    return isos[iso]
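Illustrative calls for the helpers above; the inputs are made-up examples.

print(char_split("vielä"))  # ['v', 'i', 'e', 'l', 'ä']
print(filter_arabic("كتاب 123", keep_vowels=False))  # 'كتاب'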