def viz(self, sents, trans_it=True, outf=None, local_translit=False):
    """Parse *sents*, optionally emit translation/transliteration lines,
    and return a dependency visualization of the parsed document.

    BUG FIX: the original signature was ``viz(sents, trans_it=True)`` yet the
    body referenced ``self``, ``outf`` and ``local_translit`` — all undefined.
    ``self`` is restored and the other two become keyword parameters.

    :param sents: sentence text to analyse
    :param trans_it: when True, translate to English and print/emit the result
    :param outf: optional callable used to emit text lines; falls back to print
    :param local_translit: forwarded to get_word_map for local transliteration
    :return: result of CoreNlpViz.analyse_doc
    """
    nlp = get_nlp(self.lang)
    doc = nlp(sents)
    cv = CoreNlpViz(shape='egg', size='8,5', fontsize=20)
    words = [word.text for sent in doc.sentences for word in sent.words]
    if trans_it:
        rs_trans = self.trans_to(sents, ['en'])
        if outf is not None:
            for r in rs_trans:
                outf(r)
        else:
            print(*rs_trans, sep='\n')
        # show a transliteration of the whole sentence when supported
        if self.lang in translits.available_langs():
            if outf is not None:
                outf('♡ ' + translits.translit(sents, self.lang))
            else:
                print('♡', translits.translit(sents, self.lang))
        tr_map, tr_tab = get_word_map(self.lang, 'en', sents, 0, words,
                                      local_translit=local_translit)
        if outf is not None:
            outf(' '.join(tr_tab))
    else:
        # no translation requested: analyse without a word map
        tr_map = None
    return cv.analyse_doc(doc, tr_map)
def nouns(self, phrase: Text):
    """
    $ python -m sagas.ko.ko_helper nouns '피자와 스파게티가'
    $ python -m sagas.ko.ko_helper nouns '계획이'

    :param phrase:
    :return:
    """
    from sagas.nlu.transliterations import translits
    from sagas.ko.kwn_procs import kwn

    results = []
    for noun in self.mecab.nouns(phrase):
        # look up the first matching synset for the extracted noun
        synsets = kwn.get_synsets(noun, first=True)
        if synsets:
            head = synsets[0]
            results.append({'spec': head.name(),
                            'text': noun,
                            'translit': translits.translit(noun, 'ko'),
                            'definition': head.definition()})
        else:
            results.append({'text': noun,
                            'translit': translits.translit(noun, 'ko'),
                            })
    return results
def build_kwn(lookups_nor, lookups_tra):
    """Load the Korean WordNet synset list and populate lookup tables.

    Mutates *lookups_nor* (korean lemma -> set of synset names) and
    *lookups_tra* (transliterated lemma -> set of synset names) in place.

    BUG FIX: the original only created the transliterated entry when the
    normal lemma was new; a lemma already present in ``lookups_nor`` whose
    transliteration was missing from ``lookups_tra`` raised KeyError.
    ``setdefault`` makes both insertions unconditional and equivalent.

    :return: list of row dicts describing each synset
    """
    import pandas as pd
    rs = []
    data_path = f'{cf.conf_dir}/ai/nltk/kwn_1.0/kwn_synset_list.tsv'
    df = pd.read_csv(data_path, sep='\t')
    for _, row in df.iterrows():
        ko_lemmas = row['korean_lemmas'].split(', ')
        refid = row['# synset_id']
        # synset ids look like "<offset>-<pos>"
        offset, pos = refid.split('-')
        syn = wn.synset_from_pos_and_offset(pos, int(offset))
        en_lemmas = str(row['english_lemmas']).split(', ')
        name = syn.name()
        rs.append({'id': refid,
                   'en': en_lemmas,
                   'ko': ko_lemmas,
                   'translit': [translits.translit(le, 'ko') for le in ko_lemmas],
                   'name': name,
                   'definition': syn.definition()})
        for kw in ko_lemmas:
            lookups_nor.setdefault(kw, set()).add(name)
            lookups_tra.setdefault(translits.translit(kw, 'ko'), set()).add(name)
    return rs
def translit_chunk(chunk: str, lang):
    """Return a transliteration of *chunk* for supported languages.

    Punctuation passes through untouched. In non-default runtimes the raw
    chunk is stacked above its transliteration.

    BUG FIX: the original referenced ``word.text`` where ``word`` was
    undefined (NameError on the non-default runtime path); the intent is
    clearly the raw ``chunk``.

    :param chunk: text fragment to transliterate
    :param lang: language code checked against available transliterations
    :return: transliterated text (or the original chunk when unsupported)
    """
    from sagas.nlu.transliterations import translits
    if chunk.strip() in (',', '.', ';', '?', '!'):
        return chunk
    if translits.is_available_lang(lang):
        if sa_env.runtime != 'default':
            # show the original text above the transliteration
            return chunk + '\n' + translits.translit(chunk, lang)
        return translits.translit(chunk, lang)
    return chunk
def word_values(word: Text, lang: Text):
    """Split a 'text/lemma' token and return its (possibly transliterated) parts.

    BUG FIX: the original unpacked ``word.split('/')``, which raises
    ValueError when the token contains more than one '/'; splitting with
    ``maxsplit=1`` keeps the first segment as text and the rest as lemma.

    :param word: token, either plain text or 'text/lemma'
    :param lang: language code used to decide on transliteration
    :return: dict with keys 'value' (original), 'text' and 'lemma'
    """
    from sagas.nlu.transliterations import translits
    if '/' in word:
        text, lemma = word.split('/', 1)
    else:
        text = lemma = word
    if translits.is_available_lang(lang):
        try:
            text_val = translits.translit(text, lang)
            # an empty lemma falls back to the transliterated text
            return {'value': word,
                    'text': text_val,
                    'lemma': translits.translit(lemma, lang) if lemma.strip() != '' else text_val}
        except ValueError:
            # transliteration failed: fall through to the raw values
            print(f'*** value error: text: {text}, lemma: {lemma}')
    return {'value': word, 'text': text, 'lemma': lemma}
def build_omw(lookups_nor, lookups_tra):
    """Load the Open Multilingual Wordnet Korean tab file into lookup tables.

    Mutates *lookups_nor* (korean word -> set of synset names) and
    *lookups_tra* (transliterated word -> set of synset names) in place.

    BUG FIX: like build_kwn, the original only created the transliterated
    entry when the normal word was new, so a word present in ``lookups_nor``
    without its transliteration in ``lookups_tra`` raised KeyError;
    ``setdefault`` makes both insertions unconditional.
    """
    import pandas as pd
    data_path = f'{cf.conf_dir}/ai/nltk/data/wikt/wn-wikt-kor.tab'
    df = pd.read_csv(data_path, sep='\t')
    for _, row in df.iterrows():
        refid = row['# Wiktionary']
        kw = row['http://wiktionary.org/']
        # synset ids look like "<offset>-<pos>"
        offset, pos = refid.split('-')
        syn = wn.synset_from_pos_and_offset(pos, int(offset))
        name = syn.name()
        lookups_nor.setdefault(kw, set()).add(name)
        lookups_tra.setdefault(translits.translit(kw, 'ko'), set()).add(name)
def search_in(items, phrase, lang='ko'):
    """Print every item whose first element contains *phrase*, with a
    transliteration of its second element.

    BUG FIX: the original tested ``words in item[0]`` with ``words``
    undefined (NameError on any non-empty *items*); the parameter *phrase*
    is clearly what was meant. ``lang`` was also undefined and is now a
    parameter.
    NOTE(review): default ``lang='ko'`` is an assumption — confirm against
    callers.

    :param items: iterable of (text, target) pairs
    :param phrase: substring to search for in each item's text
    :param lang: language code for the transliteration of matches
    """
    for item in items:
        if phrase in item[0]:
            print(item)
            # print(f"{l[1]} -> {l[0]}")
            print(translits.translit(item[1], lang))
    print('.. done.')
def get_verb_interr(c: DomainToken, part: Text):
    """Resolve the interrogative marker for a Korean verb token.

    Transliterates the token's surface form (the part before any '/'),
    looks it up in the interrogative dataset, and falls back to '??'.

    :return: a (priority, expression) tuple for the inspector pipeline
    """
    from sagas.nlu.inspectors_dataset import get_interrogative
    from sagas.nlu.transliterations import translits
    surface = c.text.split('/')[0]
    rep = get_interrogative(translits.translit(surface, 'ko'), 'ko')
    marker = rep if rep else '??'
    return 4, f"interr_root('{marker}')"
def ex_translit(key: Text, cnt: Text, comp: Text, ctx: cla_meta_intf):
    """Record *cnt* as an extractor result, transliterated when the
    context's language supports it.

    :return: always True (handled)
    """
    from sagas.nlu.transliterations import translits
    value = cnt
    if translits.is_available_lang(ctx.lang):
        value = translits.translit(cnt, ctx.lang)
        # tval=tval.replace('[UNK]', '').strip()
    ctx.add_result(extractor, comp, key, value)
    return True
def __call__(self, sents: Text):
    """Parse a sentence with the language's pipeline, applying a
    transliteration preprocessing step for Serbian first.

    :return: a StanzaSentImpl wrapping the first parsed sentence
    """
    from sagas.nlu.stanza_helper import get_nlp
    from sagas.nlu.transliterations import translits
    # Serbian input is transliterated before parsing
    if self.lang == 'sr':
        sents = translits.translit(sents, self.lang)
    nlp = get_nlp(self.lang)
    doc = nlp(sents)
    return StanzaSentImpl(doc.sentences[0], text=sents)
def get_contrast(word: Text, source: Text, target: Text = 'en', ips_idx=0):
    """Translate *word* and append either a local transliteration or
    pronunciation marks, for contrastive display.

    Idiom fix: replaced the redundant ``True if … else False`` ternary
    with the boolean expression itself.

    :param word: word to translate
    :param source: source language code
    :param target: target language code (default English)
    :param ips_idx: index into the pronunciation marks
    :return: translated word with the appended transliteration/marks
    """
    from sagas.nlu.transliterations import translits
    from sagas.nlu.constants import contrast_translit_langs
    if source == target:
        return word
    options = {'get_pronounce', 'disable_correct'}
    local_translit = source in contrast_translit_langs
    res, t = translate_try(word, source=source, target=target, options=options)
    if local_translit and translits.is_available_lang(source):
        trans = ', ' + translits.translit(word, source)
    else:
        trans = marks(t, ips_idx)
    return res + trans
def run(self, key, ctx: Context):
    """Return True when the token at *key* matches a language-specific
    negative word, checking either its transliteration or its lemma.
    """
    from sagas.nlu.inspectors_dataset import negative_maps
    from sagas.nlu.inspectors_dataset import translit_langs
    from sagas.nlu.transliterations import translits
    lang = ctx.meta['lang']
    # no negative-word dataset for this language
    if lang not in negative_maps:
        return False
    data_map = negative_maps[lang]
    if lang in translit_langs:
        word_val = translits.translit(ctx.words[key], lang)
    else:
        word_val = ctx.lemmas[key]
    if ctx.chunk_contains(key, data_map) or word_val in data_map:
        return True
    return False
def translit(self, word):
    """
    $ python -m sagas.ko.ko_helper translit '피자와 스파게티가'
    See also: procs-ko-konlpy.ipynb

    :param word:
    :return:
    """
    from sagas.nlu.transliterations import translits
    for surface, tag in self.mecab.pos(word):
        explain = '_'
        # only common nouns and verbs get a wordnet explanation
        if tag in ('NNG', 'VV'):
            sets = get_word_sets(surface, 'ko')
            if sets:
                explain = f"{sets['name']}({sets['definition']})"
        print(surface, translits.translit(surface, 'ko'), tag, explain)
def trans_en_to(self, text, lang, translit_targets=None, said=True):
    """Translate English *text* to several targets, format the result as a
    `.sent(...)` DSL snippet, print it, copy it to the clipboard, and
    optionally speak the target-language sentence.

    :param text: English source sentence
    :param lang: primary target language code (also the language spoken aloud)
    :param translit_targets: optional language codes whose translated
        sentences are additionally transliterated into ``t{i}=...`` fields
    :param said: when True, speak the translated sentence via NluTools
    :return: None (returns early if the translator reports failure)
    """
    import clipboard
    from sagas.nlu.transliterations import translits
    source = 'en'
    # always include French and Simplified Chinese alongside the requested target
    targets = f'fr;zh-CN;{lang}'
    says = lang
    # details=False
    ctx = TransContext(source, targets, text, says, '')
    # seed the sentence map with the source text keyed by its 2-letter code
    ctx.sents_map[source[:2]] = text
    # dispatch to the currently selected translator backend; it mutates ctx
    succ = self.translators[self.translator](ctx)
    if not succ:
        return
    # addons, result = self.parse_chunks(text, source, targets, ctx, details=details)
    addons = []
    # result = '\n\t'.join([text] + ctx.target_sents)
    lines = []
    # header of the generated DSL snippet
    lines.append(f'\t.sent({source}="{text}"')
    suffix = ") \\"
    # other appendants like: ctx.target_sents.append(f'v{i}="{ps}"')
    # append transliterations of the requested target sentences as t0=, t1=, ...
    if translit_targets is not None:
        for i, translit in enumerate(translit_targets):
            ps = translits.translit(ctx.sents_map[translit], translit)
            ctx.target_sents.append(f't{i}="{ps}"')
    result = ', \n\t '.join(lines + ctx.target_sents + [suffix])
    print(result)
    # place the snippet on the system clipboard for pasting into code
    clipboard.copy(result)
    if said:
        from sagas.nlu.nlu_tools import NluTools
        NluTools().say(ctx.sents_map[says], says)
def get_word_map(source, target, text, ips_idx=0, words=None, local_translit=False):
    """Translate each word of *text* and build a per-word display map.

    Example 1:
        from sagas.nlu.corenlp_helper import CoreNlp, CoreNlpViz, get_nlp
        ana=lambda sents: CoreNlpViz().analyse(sents, get_nlp('hi'), get_word_map('hi','en', sents)[0])
        ana('मेरे पास दो रेफ्रिजरेटर हैं')
    Example 2:
        get_word_map('hi','en', 'मेरे पास दो रेफ्रिजरेटर')[0]

    :param source: source language code
    :param target: target language code
    :param text: sentence text (split on spaces when *words* is not given)
    :param ips_idx: index into the pronunciation marks
    :param words: optional pre-tokenized word list
    :param local_translit: prefer local transliteration over pronunciation marks
    :return: (word -> display string dict, list of table entries)
    """
    from sagas.nlu.transliterations import translits
    verbose = False
    options = {'get_pronounce', 'disable_correct'}
    if words is None:
        words = text.split(' ')
    rs = {}
    trans_table = []
    for token in words:
        res, t = translate_try(token, source=source, target=target, options=options)
        if local_translit and translits.is_available_lang(source):
            trans = ', ' + translits.translit(token, source)
        else:
            trans = marks(t, ips_idx)
        rs[token] = '%s\n(%s%s)' % (token, res, trans)
        # suppress the translation when it is empty or a bare bracket/slash
        res_r = f"({res})" if res != '' and res not in ('(', ')', '[', ']', '/') else ''
        # trans[2:] drops the leading ', ' separator
        trans_table.append(f"{trans[2:]}{res_r}")
    return rs, trans_table
def trans_val(cnt, lang):
    """Return a normalized value for a 'word/lemma' token: the
    transliterated word for transliteration languages, otherwise the
    lower-cased lemma.
    """
    from sagas.nlu.transliterations import translits
    parts = cnt.split('/')
    if lang in translit_langs:
        # index 0 is word, 1 is lemma
        return translits.translit(parts[0], lang)
    return parts[-1].lower()
def translit_text(text, lang) -> Text:
    """Transliterate *text* when *lang* is supported; otherwise return it unchanged."""
    from sagas.nlu.transliterations import translits
    if not translits.is_available_lang(lang):
        return text
    return translits.translit(text, lang)
from typing import Text, Any, Dict, List, Union from sagas.nlu.transliterations import translits from sagas.nlu.translator import translate, with_words, WordsObserver import logging logger = logging.getLogger(__name__) tr = lambda w: translits.translit(w, 'hi') def trans(w, pos: Text): r, t = translate(w, source='hi', target='en', options={'get_pronounce'}, tracker=with_words()) # df=t.observer(WordsObserver).word_trans_df # if df is not None: # candidates=[w for w in df['word']][:3] # else: # candidates=[] obs: WordsObserver = t.observer(WordsObserver) candidates = obs.get_candidates(pos) return {'word': obs.get_axis(r.lower(), pos), 'candidates': candidates} def word_map(id: int, all_ws: List[Any], pos: Text) -> Dict[Text, Any]: w = next((w for w in all_ws if w.synset_id() == id), None) if w: return { 'synset': tr(w.head_word()),
def translit_chunk(chunk, lang):
    """Return '/<transliteration>' of *chunk* for supported languages,
    or an empty string otherwise.
    """
    from sagas.nlu.transliterations import translits
    # if lang in ('ko', 'ja', 'fa', 'hi', 'ar'):
    if not translits.is_available_lang(lang):
        return ''
    return '/' + translits.translit(chunk, lang)