def _load_model(name):
    # Prefer a model bundled under models/; if it is not there, treat the
    # given name as a path to a user-supplied model file.
    model_path = script_path("models/" + name)
    if not os.path.exists(model_path):
        model_path = name
    opt = opennmt_opts(model_path, **_default_kwargs())
    m = load_test_model(opt)
    models[name] = m
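def _demo_model_cache():
    # Hypothetical usage sketch, not part of the original module: _load_model
    # stores the loaded model in the models dict, so callers can load once
    # and reuse it by name. The file name below is a placeholder.
    name = "normalize.pt"
    if name not in models:
        _load_model(name)
    return models[name]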
def load_wiktionary():
    # Lazily populate the module-level lemma set on first use.
    global wiktionary
    if wiktionary is not None:
        return
    try:
        wiktionary = set(x.lower() for x in json_load(script_path("wiktionary_lemmas.json")))
    except Exception:
        print("run python -m natas.download")
        wiktionary = set()
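def _demo_load_wiktionary():
    # Hypothetical usage sketch, not part of the original module: the first
    # call reads wiktionary_lemmas.json (assuming it has been fetched with
    # python -m natas.download); later calls return immediately.
    load_wiktionary()
    load_wiktionary()  # no-op: the set is already populated
    return "house" in wiktionary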
from mikatools import script_path, json_load
from onmt.translate.translator import Translator
from onmt.decoders.ensemble import load_test_model
from onmt.translate import GNMTGlobalScorer
from itertools import islice, repeat
import configargparse as cfargparse
import spacy
import os

wiktionary = None  # populated lazily by load_wiktionary()
models = {}  # cache for loaded spaCy and OpenNMT models
is_in_data_cache = {"ceec_eng": {}, "ocr_fin": {}}


def set_spacy(nlp):
    models["spacy"] = nlp


def _get_spacy():
    if "spacy" not in models:
        try:
            models["spacy"] = spacy.load('en_core_web_md')
        except IOError:
            raise Exception("Spacy model was not loaded! Run: python -m spacy download en_core_web_md")
    return models["spacy"]


def split_corpus(f, shard_size):
    # Yield the whole file object when sharding is disabled; otherwise yield
    # successive lists of at most shard_size lines until the input is exhausted.
    if shard_size <= 0:
        yield f
    else:
        while True:
            shard = list(islice(f, shard_size))
            if not shard:
                break
            yield shard
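def _demo_split_corpus():
    # Hypothetical illustration, not part of the original module: six lines
    # sharded with shard_size=4 yield one full shard and one remainder.
    lines = iter(["a", "b", "c", "d", "e", "f"])
    shards = list(split_corpus(lines, 4))
    assert shards == [["a", "b", "c", "d"], ["e", "f"]]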
#encoding: utf-8
from __future__ import unicode_literals
import re
import unicodedata
import mikatools

isos = mikatools.json_load(mikatools.script_path("lang_codes.json"))

# Match a word character together with any run of trailing combining or
# modifier marks as one unit; otherwise match single characters.
pattern = re.compile(
    r'(\w[\u02F3\u0300\u2013\u032E\u208D\u203F\u0311\u0323\u035E\u031C\u02FC\u030C\u02F9\u0328\u032D:\u02F4\u032F\u0330\u035C\u0302\u0327\u03572\u0308\u0351\u0304\u02F2\u0352\u0355\u00B7\u032C\u030B\u2019\u0339\u00B4\u0301\u02F1\u0303\u0306\u030A7\u0325\u0307\u0354`\u02F0]+|\w|\W)',
    re.UNICODE | re.IGNORECASE)


def char_split(word):
    # NFKC-normalize first so equivalent precomposed and decomposed
    # forms split identically.
    word = unicodedata.normalize('NFKC', word)
    return pattern.findall(word)


def filter_arabic(text, keep_vowels=True, combine_by=""):
    # Keep only Arabic letters, optionally together with vowel and
    # other diacritic marks.
    if keep_vowels:
        return combine_by.join(re.findall(r"[ء-ي'ًٌٍَُِّْـ']+", text))
    else:
        return combine_by.join(re.findall(r"[ء-ي]+", text))


def iso_to_name(iso):
    # Resolve an ISO language code to its name via lang_codes.json.
    return isos[iso]
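def _demo_char_split():
    # Hypothetical illustration, not part of the original module: plain
    # letters split one by one, while a combining mark with no precomposed
    # form (t + U+0301 survives NFKC) stays attached to its base letter.
    assert char_split("kissa") == ["k", "i", "s", "s", "a"]
    assert char_split("t\u0301a") == ["t\u0301", "a"]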
import tensorflow as tf
from .utils.math import *
from .utils.bdi import *
from .utils.model import *
from . import pickle2 as pickle
from .dan_eval import SentiDAN
from mikatools import script_path
import numpy as np

# Build a TF1-style session and load the pretrained sentiment classifier.
config = tf.ConfigProto()
sess = tf.Session(config=config)
cnn = SentiDAN(sess)
cnn.load(script_path('senti_model.bin'))

infile = script_path('checkpoints/en-es-bimap-1.bin')
MAX_LEN = 64
N = 5

# Unpack the bilingual mapping checkpoint: one mapping matrix per language,
# the language codes, and the trained model itself.
dic = load_model(infile)
W_src = dic['W_source']
W_trg = dic['W_target']
src_lang = dic['source_lang']
trg_lang = dic['target_lang']
model = dic['model']

# Load the pickled word vectors for both languages.
with open(script_path('pickle/%s.bin' % src_lang), 'rb') as fin:
    src_wv = pickle.load(fin)
with open(script_path('pickle/%s.bin' % trg_lang), 'rb') as fin:
    trg_wv = pickle.load(fin)

# Register zero-vector <pad> tokens and preallocate the array that will
# hold the projected source embeddings.
src_pad_id = src_wv.add_word('<pad>', np.zeros(src_wv.vec_dim, dtype=np.float32))
trg_pad_id = trg_wv.add_word('<pad>', np.zeros(trg_wv.vec_dim, dtype=np.float32))
src_proj_emb = np.empty(src_wv.embedding.shape, dtype=np.float32)
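def _project_embeddings():
    # Hypothetical sketch, not part of the original file: apply the
    # checkpoint's mapping matrices to move both embedding spaces into the
    # shared bilingual space. The right-multiplication convention and the
    # trg_proj_emb name are assumptions inferred from the (vocab, dim)
    # array preallocated above.
    np.dot(src_wv.embedding, W_src, out=src_proj_emb)
    trg_proj_emb = np.dot(trg_wv.embedding, W_trg)
    return src_proj_emb, trg_proj_emb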