from typing import Dict, Text

import pandas as pd

from sagas.conf.conf import cf
from sagas.nlu.uni_parser import get_chunks
from sagas.nlu.uni_remote import dep_parse
from sagas.nlu.uni_remote_viz import list_chunks
from sagas.nlu.utils import fix_sents


def get_corpus(lang: Text, chapter: Text):
    dfjson = pd.read_json(f'~/pi/stack/crawlers/langcrs/all_{lang}.json')
    ch = dfjson[dfjson['chapter'].str.match(chapter)]
    # normalise every translated sentence in the matched chapter
    rs = []
    for sent in ch['translate']:
        rs.append(fix_sents(sent, lang))
    return rs
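# Usage sketch for get_corpus; the 'ko' corpus file and the chapter title
# are illustrative assumptions (mirroring the proc_corpus doctest below).
#   >>> sents = get_corpus('ko', 'At school')
#   >>> print(len(sents), sents[:2])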
def build_anal_tree(sents: Text, lang: Text, engine: Text, nodecls=None, docimpl=None):
    """
    >>> from sagas.nlu.anal import build_anal_tree
    >>> from anytree.search import findall, findall_by_attr
    >>> f = build_anal_tree(sents, lang, engine)
    >>> words = findall_by_attr(f, name='upos', value='VERB')
    >>> objs = findall(words[0], filter_=lambda n: n.dependency_relation in ("obj",))

    :param sents:
    :param lang:
    :param engine:
    :return:
    """
    from sagas.nlu.uni_remote import dep_parse
    from sagas.nlu.utils import fix_sents

    sents = fix_sents(sents, lang)
    # doc is a SentenceIntf implementation
    doc, resp = dep_parse(sents, lang=lang, engine=engine,
                          pipelines=['predicts'], doc_impl=docimpl)
    predicts = resp['predicts'] if resp and 'predicts' in resp else []
    return from_doc(doc, lang, engine, nodecls, predicts)
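# Concrete invocation sketch for build_anal_tree; the sentence and engine
# values are illustrative assumptions (the docstring leaves them unbound).
#   >>> from anytree.search import findall_by_attr
#   >>> f = build_anal_tree('I like to eat cucumber.', 'en', 'stanza')
#   >>> verbs = findall_by_attr(f, name='upos', value='VERB')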
def get_records(lang, chapter, field, fix=False):
    from sagas.tool.intents_tool import intents_tool
    text_list = [(
        doc[field] if not fix else fix_sents(doc[field], lang),
        doc['text'] if lang != 'en' else f"{doc['chapter'].lower()[:10]}_{doc['index']}",
        doc['intent'] if 'intent' in doc else '',
    ) for doc in intents_tool.db.corpus.find({'chapter': chapter})]
    return text_list
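# Usage sketch for get_records; the chapter and field values are
# illustrative assumptions (field naming follows list_chapter_text below).
#   >>> for sent, ref, intent in get_records('fr', 'People', 'lang_fr', fix=True)[:3]:
#   ...     print(ref, '->', sent, intent)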
def parse_comps(sents, source):
    sents = fix_sents(sents, source)
    engine = cf.engine(source)
    doc_jsonify, resp = dep_parse(sents, source, engine, ['predicts'])
    # prefer the predictor results, fall back to plain chunking
    if len(resp['predicts']) > 0:
        rs = resp['predicts']
    else:
        rs = get_chunks(doc_jsonify)
    return rs
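# Usage sketch for parse_comps; the sentence is borrowed from the
# clip_parse docstring below. Each result carries 'type' and 'word' keys.
#   >>> for r in parse_comps('De vrouw heeft verschillende appels.', 'nl'):
#   ...     print(r['type'], '->', r['word'])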
def parse_deps(text, lang, translit=None):
    import streamlit as st

    text = fix_sents(text, lang)
    engine = cf.engine(lang)
    # g = sentence_view(lang, text, engine=engine, translit_lang=lang, enable_contrast=True)
    doc_jsonify, resp = dep_parse(text, lang, engine, ['predicts'])
    if doc_jsonify is not None:
        list_chunks(doc_jsonify, resp, lang, enable_contrast=True)
        g = display_doc_deps(doc_jsonify, resp, translit_lang=lang)
        st.graphviz_chart(g)
        if translit is not None:
            st.text(f"♤ {translit}")
        words = [word.text for word in doc_jsonify.words]
        tools.contrast(text, lang, word_map=words)
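# Usage sketch for parse_deps, meant to run inside a `streamlit run` app;
# the sentence and the missing transliteration are illustrative assumptions.
#   >>> parse_deps('Tuolla ylhäällä asuu vanha nainen.', 'fi', translit=None)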
def proc_corpus(lang, chapter):
    """
    >>> proc_corpus('ko', 'At school')

    :param lang:
    :param chapter:
    :return:
    """
    dfjson = pd.read_json(f'~/pi/stack/crawlers/langcrs/all_{lang}.json')
    ch = dfjson[dfjson['chapter'].str.match(chapter)]
    rs_map = {}
    for i, (sent, ref) in enumerate(zip(ch['translate'], ch['text'])):
        text = fix_sents(sent, lang)
        print('-', i, ref, '-' * 10)
        # digest(text, 'spacy')
        rs_map[i] = digest_verb(text, lang, 'stanza')
    return len(ch), len([t for t in rs_map.values() if t])
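# Usage sketch mirroring the doctest above: the return value pairs the
# chapter size with the number of sentences whose verb digest succeeded.
#   >>> total, digested = proc_corpus('ko', 'At school')
#   >>> print(f'{digested}/{total} sentences digested')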
def list_chapter_text(self, lang, chapter, fix=False):
    """
    >>> intents_tool.list_chapter_text('fr', 'People', True)
    >>> intents_tool.list_chapter_text('en', 'People')

    :param lang:
    :param chapter:
    :param fix:
    :return:
    """
    field = f'lang_{lang}' if lang != 'en' else 'text'
    print(*[doc[field] if not fix else fix_sents(doc[field], lang)
            for doc in self.db.corpus.find({'chapter': chapter})],
          sep='\n')
def clip_parse(self, source, sents='', specified='default', do_test=False):
    """
    >> clip text: یک آبجو مى خواهم.
    $ nlu clip_parse fa
    $ engine='stanford' nluc ar
    $ nlu clip_parse fi 'Tuolla ylhäällä asuu vanha nainen.'
    $ nluc nl 'De vrouw heeft verschillende appels.'
    $ nluc id 'Ini adalah judul buku yang saya baca.' aux
    $ nluc fi 'Voiko täältä lainata aurinkovarjoa?' default True

    :param source:
    :return:
    """
    from sagas.nlu.uni_remote import dep_parse
    from sagas.nlu.common import get_from_clip
    from sagas.conf.conf import cf
    from sagas.nlu.uni_remote_viz import list_chunks
    from sagas.nlu.utils import fix_sents

    if sents == '':
        sents = get_from_clip()
    if sents.strip() == '':
        tc.info('no text available in clipboard.')
        return
    sents = fix_sents(sents, source)
    tc.info(sents)

    # Parse the sentence and display its chunks, domains and contrast translations.
    engine = cf.engine(source)
    doc_jsonify, resp = dep_parse(sents, source, engine, ['predicts'])
    if doc_jsonify is None:
        raise Exception(f'Cannot parse sentence for lang {source}')

    list_chunks(doc_jsonify, resp, source,
                enable_contrast=True,
                specified=None if specified == 'default' else specified)
    words = [word.text for word in doc_jsonify.words]
    self.contrast(sents, source, word_map=words)

    ## visual tree
    self.main_domains(sents, source, engine, False)

    ## add rulesets procs
    from sagas.nlu.inferencer import do_infers
    cli_cmd, pats = do_infers(sents, source)
    if do_test:
        for pat in pats:
            self.check_rule(sents, source, pat)
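# Python-level usage sketch for clip_parse: passing sents bypasses the
# clipboard. The owning CLI instance here is an assumption.
#   >>> nlu = ...  # the CLI object this method is bound to
#   >>> nlu.clip_parse('fi', 'Voiko täältä lainata aurinkovarjoa?')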
def sents_summary(sents, source):
    from sagas.nlu.uni_remote import dep_parse
    from sagas.nlu.uni_remote_viz import list_contrast
    from sagas.conf.conf import cf
    from sagas.nlu.utils import fix_sents
    from sagas.nlu.uni_parser import get_chunks

    sents = fix_sents(sents, source)
    engine = cf.engine(source)
    doc_jsonify, resp = dep_parse(sents, source, engine, ['predicts'])
    types = []
    if doc_jsonify is None:
        raise Exception(f'Cannot parse sentence for lang {source}')
    if len(resp['predicts']) > 0:
        rs = resp['predicts']
    else:
        rs = get_chunks(doc_jsonify)
    for serial, r in enumerate(rs):
        print(f"{serial}. {r['type']} -> {r['word']}")
        types.append(f"{source}:{r['type']}")
    list_contrast(rs, source)
    return types
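# Usage sketch for sents_summary; the sentence is borrowed from the
# trans_clip docstring below. Returns '{source}:{type}' tags, one per chunk.
#   >>> print(sents_summary('I like to eat cucumber.', 'en'))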
def parse_sents(data: Dict):
    sents, source, engine = data['sents'], data['lang'], data['engine']
    sents = fix_sents(sents, source)
    # engine = cf.engine(source)
    return parse_and_cache(sents, source, engine)
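# Usage sketch for parse_sents; the request dict mirrors what fix_data
# below produces, and the engine value is an illustrative assumption.
#   >>> parse_sents({'sents': 'Ini adalah judul buku yang saya baca.',
#   ...              'lang': 'id', 'engine': 'stanza'})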
def fix_data(data):
    if 'engine' not in data:
        data['engine'] = cf.engine(data['lang'])
    data['sents'] = fix_sents(data['sents'], data['lang'])
    return data
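# Usage sketch for fix_data: it fills in the default engine and normalises
# the sentence in place; the lang/sents values are illustrative assumptions.
#   >>> data = fix_data({'sents': 'یک آبجو مى خواهم.', 'lang': 'fa'})
#   >>> data['engine']  # now set via cf.engine('fa')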
def trans_clip(self, source='auto', targets='zh-CN;ja', says=None, details=True, sents='', deps=''):
    """
    $ trans
    $ trans auto en
    $ trans ru en
    $ trans ru 'zh-CN;ja'
    $ trans-ru
    $ trans-rus
    $ alias sp="python -m sagas.tool.misc trans_clip pt 'en;it;ja' ja False"
    $ sp 'O homem fica amarelo.'
    $ sa 'أنا متأسف.'
    $ sf "La similitude entre ces deux phrases" 'ja;zh;id'
    $ sz '这两句话的相似程度' en
    $ sz '这两句话的相似程度' 'en;fr;ar;ja;fa'
    $ engine=spacy se 'I like to eat cucumber.'
    $ sj '足にひどい痛みを感じました。'  # multiple predicates

    :return:
    """
    import clipboard
    from sagas.nlu.nlu_cli import NluCli
    from sagas.nlu.nlu_tools import NluTools

    ascii_incompatibles = ['zh', 'ja', 'ko', 'ar', 'fa']
    if sents != '':
        text = sents
        interact_mode = False
    else:
        text = clipboard.paste()
        text = text.replace("\n", "")
        interact_mode = True
    # remove spaces if lang is ja/zh
    # if source in ('ja','zh'):
    #     text=text.replace(' ','')
    text = fix_sents(text, source)
    engine = cf.engine(source)
    tc.emp('yellow', f".. parse with {engine}: ({text})")

    # events
    from sagas.nlu.events import init_reps
    init_reps()  # added at 2019.9.15

    ascii_gs = []
    if self.enable_ascii_viz:
        rt = NluCli().ascii_viz(text, source, engine=engine)
        if source not in ascii_incompatibles:
            ascii_gs.extend(rt.split('\n'))
        print(rt)

    # target_sents=[]
    # sents_map={}
    ctx = TransContext(source, targets, text, says, deps)
    # print('❣', text)
    if source != 'auto':
        # text = fix_sents(source, text)
        ctx.sents_map[source[:2]] = text

    # addi_pronounce=[]
    # succ=self.trans_google(ctx)
    succ = self.translators[self.translator](ctx)
    if not succ:
        return
    # if len(addi_pronounce)>0:
    #     target_sents.extend(addi_pronounce)

    ## addons
    if self.enable_chunks_parse:
        addons, result = self.parse_chunks(text, source, targets, ctx, details=details)
    else:
        addons = []

    # result = '\n\t'.join([text] + ctx.target_sents)
    lines = []
    lines.append(f'\t.sent({source}="{text}"')
    suffix = ") \\"
    result = ', \n\t '.join(lines + ctx.target_sents + [suffix])
    print(result)

    # dep-parse for the other langs
    if self.enable_ascii_viz and deps != '':
        for t in deps.split(';'):
            if t in ctx.sents_map:
                rt = NluCli().ascii_viz(ctx.sents_map[t], t, engine=cf.engine(t))
                # ascii_gs.extend(rt.split('\n'))
                print(rt)
            else:
                color_print('red',
                            f".. the lang {t} for dep-parse is not available in the translated list.")

    # do infers
    from sagas.nlu.inferencer import do_infers
    cli_cmd, pats = do_infers(text, source)

    # tools
    tools = NluTools()
    # if cf.is_enabled('print_tree'):  # moved to events
    #     tools.main_domains(text, lang=source, engine=engine, print_domains=False)

    # copy to clipboard
    if interact_mode:
        result = result + '\n\t'
        if self.append_ascii_viz and len(ascii_gs) > 0:
            result = result + '\n\t'.join(ascii_gs)
        if len(addons) > 0:
            # result=result+'\n\t'+'\n\t'.join(addons)
            result = result + '\n\t'.join(addons)
        if pats:
            result = '\n\t'.join([result, cli_cmd, '\n\t'.join(pats)])
        if self.enable_chunks_parse:
            result = result + '\n'
        # clipboard.copy(result+'\n')
        clipboard.copy(result)

    if interact_mode and says is not None:
        tools.say(ctx.sents_map[says], says)
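# Usage sketch for trans_clip, mirroring the `sp` alias in the docstring;
# the owning tool instance is an assumption.
#   >>> misc = ...  # instance of the tool class exposing trans_clip
#   >>> misc.trans_clip('pt', 'en;it;ja', says='ja', details=False,
#   ...                 sents='O homem fica amarelo.')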