def get_domains(sents, lang, engine='corenlp', options=None):
    """
    >>> from sagas.nlu.legacy.aiobj_kit import get_domains
    >>> get_domains('你有几台笔记本电脑?', 'zh', 'ltp')
    >>> get_domains('列出上周编辑的文件。', 'zh', 'ltp', DomainGetOptions(enable_predicts=True))

    :param sents:
    :param lang:
    :param engine:
    :param options:
    :return:
    """
    # from IPython.display import display
    if options is None:
        options = DomainGetOptions()
    pipelines = ['predicts'] if options.enable_predicts else []
    doc_jsonify, resp = dep_parse(sents, lang, engine, pipelines)
    result_set = []
    if doc_jsonify is not None:
        tc.emp('cyan', resp)
        if resp is not None and 'predicts' in resp and len(resp['predicts']) > 0:
            rs = resp['predicts']
        else:
            # print(doc_jsonify.words_string())
            rs = get_chunks(doc_jsonify)

        if len(rs) > 0:
            if options.list_chunks:
                list_rs(rs, lang)
            if options.deps_graph:
                # display(display_doc_deps(doc_jsonify, resp))
                tc.gv(display_doc_deps(doc_jsonify, resp,
                                       translit_lang=lang if lang in ('ja', 'ko', 'zh', 'fa', 'ar', 'he') else None))
            data = {'lang': lang, "sents": sents, 'engine': engine, 'pipelines': pipelines}
            for r in rs:
                # fixture.print_table(r, False)
                # df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'lemma', 'children', 'features'])
                domains = r['domains']
                common = {'lemma': r['lemma'], 'word': r['word'], 'stems': r['stems']}
                meta = {'rel': r['rel'], **common, **data}
                result_set.append((domains, meta))
        else:
            tc.emp('red', '.. no predefined chunk-patterns found.')
            tc.info(doc_jsonify.words_string())
            tc.info(doc_jsonify.dependencies_string())
    return result_set
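# Illustrative sketch (not part of the original module): how a caller might consume the
# (domains, meta) tuples returned by get_domains. The sentence and options mirror the
# doctest above; `_demo_get_domains` is a hypothetical helper added for illustration.
def _demo_get_domains():
    rs = get_domains('你有几台笔记本电脑?', 'zh', 'ltp',
                     DomainGetOptions(enable_predicts=True))
    for domains, meta in rs:
        # meta carries the chunk-level attributes plus the request data
        tc.info(meta['rel'], meta['lemma'], meta['word'], meta['lang'])
        # domains lists the dependents: (rel, index, text, lemma, children, features)
        for d in domains:
            tc.info('\t', d)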
def print_terms_zh(sents, result):
    from termcolor import colored
    for verb in result['verbs']:
        sents = sents.replace(verb, colored(verb, 'green'))
    for item, value in result.items():
        if 'sbv' in item:
            sents = sents.replace(value, colored(value, 'red'))
        if 'ob' in item:
            sents = sents.replace(value, colored(value, 'blue'))
        # if 'cop' in item:
        #     sents = sents.replace(value, colored(value, 'magenta'))
    tc.info('%s: %s' % (result['lang'], sents))
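# Illustrative sketch (an assumption, not from the original source): the shape of the
# `result` dict that print_terms_zh expects -- a 'lang' field, a 'verbs' list, and role
# entries whose keys contain 'sbv' (subject) or 'ob' (object). `_demo_print_terms_zh`
# is a hypothetical helper added for illustration.
def _demo_print_terms_zh():
    result = {'lang': 'zh',
              'verbs': ['有'],
              'sbv': '你',
              'vob': '电脑'}
    print_terms_zh('你有几台笔记本电脑?', result)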
def clip_parse(self, source, sents='', specified='default', do_test=False):
    """
    >> clip text: یک آبجو مى خواهم.
    $ nlu clip_parse fa
    $ engine='stanford' nluc ar
    $ nlu clip_parse fi 'Tuolla ylhäällä asuu vanha nainen.'
    $ nluc nl 'De vrouw heeft verschillende appels.'
    $ nluc id 'Ini adalah judul buku yang saya baca.' aux
    $ nluc fi 'Voiko täältä lainata aurinkovarjoa?' default True

    :param source:
    :return:
    """
    from sagas.nlu.uni_remote import dep_parse
    from sagas.nlu.common import get_from_clip
    from sagas.conf.conf import cf
    from sagas.nlu.uni_remote_viz import list_chunks
    from sagas.nlu.utils import fix_sents

    if sents == '':
        sents = get_from_clip()
        if sents.strip() == '':
            tc.info('no text available in clipboard.')
            return
    sents = fix_sents(sents, source)
    tc.info(sents)

    # Parse the sentence and display its chunks, domains and contrast translations.
    engine = cf.engine(source)
    doc_jsonify, resp = dep_parse(sents, source, engine, ['predicts'])
    if doc_jsonify is None:
        raise Exception(f'Cannot parse sentence for lang {source}')
    list_chunks(doc_jsonify, resp, source, enable_contrast=True,
                specified=None if specified == 'default' else specified)
    words = [word.text for word in doc_jsonify.words]
    self.contrast(sents, source, word_map=words)

    ## visual tree
    self.main_domains(sents, source, engine, False)

    ## add rulesets procs
    from sagas.nlu.inferencer import do_infers
    cli_cmd, pats = do_infers(sents, source)
    if do_test:
        for pat in pats:
            self.check_rule(sents, source, pat)
def list_rs(rs, lang):
    # from IPython.display import display
    from termcolor import colored
    tc.emp('cyan', f"✁ chunks. {'-' * 25}")
    for serial, r in enumerate(rs):
        df = sagas.to_df(r['domains'],
                         ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        if 'head' in r:
            cla = "%s/%s(%s)" % (r['head_word'], r['head'], r['head_pos'])
        else:
            cla = '_'
        tc.info(serial_numbers[serial], '%s(%s)' % (r['type'], r['lemma']), cla)
        # sagas.print_df(df)
        tc.dfs(trunc_cols(df))
        print_stem_chunks(r)
        list_synsets(r, lang)
def universal_viz(intp, sents):
    from sagas.nlu.uni_parser import get_chunks
    from sagas.tool.misc import print_stem_chunks
    import sagas

    doc = intp(sents)
    doc.build_dependencies()
    # print(doc.dependencies)
    rs = get_chunks(doc)
    for r in rs:
        df = sagas.to_df(r['domains'],
                         ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        tc.info('%s(%s)' % (r['type'], r['lemma']))
        tc.dfs(df)
        print_stem_chunks(r)

    cv = EnhancedViz(shape='egg', size='8,5', fontsize=20)
    return cv.analyse_doc(doc, None)
def _(self, lang, text, *sents):
    tc.info(type(self).__name__, isinstance(self, Keeper), text, lang)
    # data = {'lang': lang, "sents": text, 'engine': 'corenlp', 'disable_predicts': False}
    # domains, meta = self.request_domains(data)
    # engine = 'ltp' if lang == 'zh' else 'corenlp'
    engine = cf.engine(lang)
    domain_set = get_domains(text, lang, engine)
    for domains, meta in domain_set:
        # print(f"{meta['lemma']} ({meta['phonetic']}, {meta['word']})")
        # execute rulesets
        tc.info('rules', [r.name for r in self.rulesets])
        for i, ruleset in enumerate(self.rulesets):
            tc.emp('cyan', f"✁ {i}. {'-' * 25}")
            rule_rs = ruleset(domains, meta, self, sents)
            display_result_df(rule_rs)
    if isinstance(self, Keeper):
        return self.callback(text)
    return None
def dep_parse(sents: Text, lang: Text = 'en', engine='stanza',
              pipelines: List[Text] = None, doc_impl=None) -> (SentenceIntf, Dict):
    if pipelines is None:
        pipelines = []
    if doc_impl is None:
        doc_impl = cf.extensions('doc.wrapper', engine)
    data = {'lang': lang, "sents": sents, 'engine': engine, 'pipelines': pipelines}
    logger.debug(f".. request is {data}")

    response = requests.post(f'{cf.servant(engine)}/dep_parse', json=data)
    if response.status_code != 200:
        tc.info('.. dep_parse servant invocation failed.')
        return None, None
    result = response.json()
    words = result['sents']
    if len(words) == 0:
        tc.info('.. dep_parse servant returns an empty set.')
        tc.info('.. request data is', data)
        return None, None

    doc_jsonify = doc_impl(words, text=sents)
    if len(pipelines) > 0:
        result_set = {p: result[p] for p in pipelines}
        return doc_jsonify, result_set
    return doc_jsonify, None
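# Illustrative sketch (a hypothetical caller mirroring how get_domains uses dep_parse):
# parse a sentence with the 'predicts' pipeline enabled and unpack the two return values.
# `_demo_dep_parse` is an added helper, not part of the original module.
def _demo_dep_parse():
    doc_jsonify, resp = dep_parse('列出上周编辑的文件。', lang='zh',
                                  engine='ltp', pipelines=['predicts'])
    if doc_jsonify is None:
        tc.info('.. parse failed or servant unavailable.')
        return
    tc.info(doc_jsonify.words_string())
    if resp is not None and 'predicts' in resp:
        tc.info('.. predicts:', resp['predicts'])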
def __call__(self, domains, meta, ctx=None, param_sents=None):
    rule_rs = self.rules(domains, meta)
    # .. parts {'sbv': '你', 'vob': '电脑', 'wp': '?'}
    tc.info('.. parts', {k: v for k, v in rule_rs[0][3].lemmas.items()})
    if all([val[1] for val in rule_rs]):
        results = [el for r in rule_rs for el in r[3].results]
        # .. results
        # ('ins_rasa', 'vob', {'intent': 'how_many', 'confidence': 0.9721028208732605})
        if len(results) > 0:
            tc.info('.. results')
            tc.info([f"{r[0]}/{r[1]}" for r in results])
            # color_print('blue', json.dumps(results, indent=2, ensure_ascii=False))
            tc.emp('blue', results)

        # If kwargs is not empty, apply the rulesets in kwargs to param_sents,
        # collect the resulting inspector results under the corresponding parameter
        # names, and pass them together with the results from the rules as the
        # arguments when invoking the executor.
        if len(self.parameters) > 0:
            tc.emp('red', 'parameters -> %s' % ', '.join(self.parameters.keys()))
            if param_sents is not None:
                tc.emp('yellow', '; '.join(param_sents))

        # .. matched: how_many_artifact
        if ctx is not None:
            self.executor(ctx)
        else:
            self.executor(self.name)
    return rule_rs
def print_dependencies(self, doc, segs, node_maps, verbose=False):
    for dep_edge in doc.dependencies:
        if verbose:
            tc.info((dep_edge[2].text, dep_edge[0].index, dep_edge[1]))
        # head = int(dep_edge[0].index)  # governor-id is index in words list + 1
        rel = dep_edge[1]
        if rel.endswith('comp'):
            self.f.attr('edge', style='dashed')
        else:
            self.f.attr('edge', style='solid')
        head = int(dep_edge[0].index) - 1
        node_text = node_maps[dep_edge[2].text]
        if head == -1:
            # print("%s's head is root %s" % (node_text, segs[head]))
            head_node = 'ROOT'
        else:
            head_node = segs[head]
        self.f.edge(head_node, node_text, label=self.fix_console_label(rel),
                    fontsize='11', fontname='Calibri')
def run(self, text, langs=None, top_result=5, summary=False, verbose=True):
    """
    $ python -m sagas.corpus.searcher run 'I read a letter.'
    $ python -m sagas.corpus.searcher run 'I read a letter.' ja,id
    $ python -m sagas.corpus.searcher run 'I read a letter.' ja,id,fa 2 True False

    :param text:
    :return:
    """
    # First find the English sentences in the corpus that are most similar to the given text.
    relevant_quotes, relevant_chapters = self.search(text, ['text', 'chapter'], top_result)
    summary_info = []
    for q in range(top_result):
        tc.emp('magenta', '>' + relevant_quotes[q])
        tc.emp('green', relevant_chapters[q])
        if langs is not None:
            # The corpus is aligned against English, so looking up the English sentence
            # directly yields the sentences in the other languages, e.g.
            # search_in_list('I write a letter.', ['ja', 'fa', 'id'])
            results = search_in_list(relevant_quotes[q], langs)
            if verbose:
                tc.emp('blue', json.dumps(results, indent=2, ensure_ascii=False))
            if summary:
                all_types = []
                rs_c = CorpusSearcher.parse_controls(results)
                for r in rs_c:
                    if r[2] != '':
                        tc.emp('red', f".. {r[2]}")
                    types = sents_summary(r[0], r[1])
                    all_types.extend(types)
                summary_info.append((relevant_quotes[q], all_types))
        tc.emp('cyan', '✁', '-' * 30)

    for s in summary_info:
        tc.info(s)
def request_domains(self, data: Dict[Text, Any], print_format='table', engine=None):
    import requests
    import json
    from sagas.conf.conf import cf
    from sagas.nlu.rules_meta import build_meta

    if engine is None:
        engine = cf.engine(data['lang'])
    data['engine'] = engine
    tc.info(f".. request is {data}")
    response = requests.post(f'{cf.servant(engine)}/verb_domains', json=data)
    rs = response.json()
    if len(rs) == 0:
        tc.info('.. verb_domains servant returns empty set.')
        tc.info('.. request data is', data)
        return None, None

    r = rs[0]
    # if print_format == 'table':
    #     self.print_table(rs)
    # elif print_format == 'jupyter':
    #     self.print_table(rs, False)
    if print_format != 'json':
        self.print_table(rs)
    else:
        tc.info(json.dumps(r, indent=2, ensure_ascii=False))

    domains = r['domains']
    # common = {'lemma': r['lemma'], 'word': r['word'], 'stems': r['stems']}
    # meta = {'rel': r['rel'], **common, **data}
    meta = build_meta(r, data)
    return domains, meta
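# Illustrative sketch (an assumption, modeled on the commented-out caller in the keeper
# handler): build the request payload and let request_domains resolve the engine from the
# configured language. `_demo_request_domains` and its `cli` parameter (the object owning
# request_domains) are hypothetical and added only for illustration.
def _demo_request_domains(cli):
    data = {'lang': 'zh', 'sents': '你有几台笔记本电脑?', 'disable_predicts': False}
    domains, meta = cli.request_domains(data, print_format='json')
    if domains is not None:
        tc.info('.. meta keys:', list(meta.keys()))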
def __init__(cls, clsname, superclasses, attributedict):
    cls.rulesets = [
        RuleSet('how_many_artifact_c',
                rules=lambda d, m: [
                    # $ sz '你有几台笔记本电脑?'
                    Patterns(d, m, 5).verb(behaveof('have', 'v'), __engine='ltp',
                                           vob=intentof('how_many', 0.75)),
                    *actions_vob(d, m, [('have', 'device/artifact'), ]),
                ],
                executor=lambda obj: tc.info('red', f'.. object: {obj}'),
                files=lambda d, m: [
                    # $ sz '有多少文件'
                    Patterns(d, m, 5).verb(behaveof('have', 'v'), __engine='ltp',
                                           a1=kindof('file/communication', 'n')),
                ],
                checkers=lambda d, m: [
                    *actions_vob([('have', 'device/artifact'), ]),
                ],
                ),
    ]
    BaseMeta.setup(cls)
def contrast(self, text, source, target='en', word_map=None):
    """
    $ nlu contrast '저는 허락을 못 받아서 안 왔어요.' ko

    :param text:
    :param source:
    :param target:
    :return:
    """
    from sagas.nlu.translator import get_word_map
    from sagas.nlu.translator import translate
    from sagas.tool.misc import color_print

    r, tracker = translate(text, source=source, target=target, options={'get_pronounce'})
    tc.info(r)
    for i, p in enumerate(tracker.pronounce):
        ps = p[2:]
        tc.info(f'v{i}="{ps}"')

    rs, trans_table = get_word_map(source, target, text, words=word_map,
                                   local_translit=True if source in contrast_translit_langs else False)
    for i, (k, r) in enumerate(rs.items()):
        tc.info(f"{i} - ", r.replace('\n', ' '))
    color_print('cyan', ' '.join(trans_table))
def callback(self, t):
    tc.info(f'** {self.name} callback', t)
    return self
def analyse_doc(self, sentence, node_maps=None, console=True):
    from sagas.nlu.uni_intf import sub_comps
    import unicodedata

    segs = []
    # omit {word.feats}
    if console:
        tc.info(*[f'index: {word.index}\ttext: {word.text + " "}\tlemma: {word.lemma}\tupos: {word.upos}\txpos: {word.xpos}'
                  for word in sentence.words], sep='\n')
    else:
        # from IPython.display import display
        import sagas
        df = sagas.to_df([(word.index, word.text, word.lemma, word.upos, word.xpos, word.entity)
                          for word in sentence.words],
                         ['index', 'text', 'lemma', 'upos', 'xpos', 'entity'])
        tc.dfs(df)

    def translit_chunk(chunk: str, lang):
        from sagas.nlu.transliterations import translits
        # if upos == 'PUNCT':
        #     return chunk
        if chunk.strip() in (',', '.', ';', '?', '!'):
            return chunk
        # if lang in ('ko', 'ja', 'fa', 'hi', 'ar'):
        if translits.is_available_lang(lang):
            if sa_env.runtime != 'default':
                return word.text + '\n' + translits.translit(chunk, lang)
            return translits.translit(chunk, lang)
        return chunk

    if node_maps is None:
        node_maps = {}
        for word in sentence.words:
            pos_attrs = f"({word.upos.lower()}, {word.xpos.lower()})"
            node_text = word.text if self.translit_lang is None or word.upos == 'PUNCT' \
                else translit_chunk(word.text, self.translit_lang)
            # node_text = unicodedata.normalize('NFKC', node_text) if word.upos == 'PUNCT' else node_text
            norm = lambda t: unicodedata.normalize('NFKC', t).encode('ascii', 'ignore').decode("utf-8")
            node_text = norm(node_text) if word.upos == 'PUNCT' else node_text
            if node_text == '':
                node_text = '_'
            # verbose
            if word.text != node_text:
                print('# ', f"{word.text} -> {node_text}")
            node_maps[word.text] = node_text if not self.enable_node_pos else f"{node_text}\\n{pos_attrs}"

    # self.f.attr(color='black')
    prop_sets = {'VERB': lambda f: f.attr('node', style='filled', color='lightgrey'),
                 'PRON': lambda f: f.attr('node', style='dashed', color='red'),
                 'AUX': lambda f: f.attr('node', style='dashed', color='green'),
                 'NOUN': lambda f: f.attr('node', style='solid', color='blue'),
                 }
    # sentence = doc.sentences[0]
    for word in sentence.words:
        rel = word.dependency_relation
        if rel in sub_comps:
            if word.upos == 'VERB':
                self.f.attr('node', style='filled', color='antiquewhite')
            elif word.upos in prop_sets:
                prop_sets[word.upos](self.f)
            else:
                self.default_node()  # for all languages
        elif rel.endswith('comp'):
            self.f.attr('node', style='filled', color='antiquewhite')
        elif word.upos in prop_sets:
            prop_sets[word.upos](self.f)
        else:
            self.default_node()

        head = ''
        if word.governor == 0:
            head = '_root_'
        else:
            head_word = sentence.words[word.governor - 1]
            head = head_word.text
        # print(f"{word.text} -> {rel}, {word.governor}, {head}")
        self.f.node(node_maps[word.text])
        segs.append(node_maps[word.text])

    # self.f.node_attr.update(color='black')
    self.default_node()
    self.print_dependencies(sentence, segs, node_maps)
    return self.f
def rs_represent(rs: List[Any], data: Dict[Text, Any], return_df=False):
    import sagas
    from sagas.nlu.rules import verb_patterns, aux_patterns, subj_patterns, predict_patterns
    from sagas.nlu.rules_lang_spec import langspecs
    from sagas.nlu.nlu_cli import NluCli
    from sagas.nlu.sinkers import Sinkers

    df_set = []
    result = []
    sinkers = Sinkers(data, rs[0]['type'])
    for serial, r in enumerate(rs):
        type_name = r['type']
        meta = build_meta(r, data)
        if type_name == 'verb_domains':
            theme = '[verb]'
            tc.info(serial_numbers[serial], theme,
                    f"{r['word']}/{r['lemma']}, pos: {r['upos']}/{r['xpos']}, idx: {r['index']}",
                    '(%s, %s)' % (r['rel'], r['governor']))
            verb_patterns(meta, r['domains'])
        elif type_name == 'aux_domains':
            theme = '[aux]'
            # 'rel': word.dependency_relation, 'governor': word.governor, 'head': dc.text
            delegator = '☇' if not r['delegator'] else '☌'
            tc.info(serial_numbers[serial], theme, r['lemma'], r['rel'], delegator,
                    "%s(%s)" % (r['head'], r['head_pos']))
            aux_patterns(meta, r['domains'])
        elif type_name == 'subj_domains':
            theme = '[subj]'
            tc.info(serial_numbers[serial], theme, r['lemma'], r['rel'], '☇', f"{r['head']}")
            subj_patterns(meta, r['domains'])
        elif type_name == 'predicate':
            theme = '[predicates]'
            tc.info(serial_numbers[serial], theme, f"{r['lemma']} ({r['phonetic']}, {r['word']})")
            predict_patterns(meta, r['domains'])
        elif type_name == 'root_domains':
            theme = '[root]'
            tc.info(serial_numbers[serial], theme,
                    f"{r['word']}/{r['lemma']}, pos: {r['upos']}/{r['xpos']}, idx: {r['index']}",
                    '(%s, %s)' % (r['rel'], r['governor']))
        else:
            raise Exception('Cannot process specific type: {}'.format(type_name))

        # process language-specific rules
        logger.debug(f"meta keys {meta.keys()}")
        mod_rs = langspecs.check_langspec(data['lang'], meta, r['domains'], type_name)
        sinkers.add_module_results(mod_rs)

        # df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'children', 'features'])
        df = sagas.to_df(r['domains'],
                         ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        df_set.append(df)
        if not return_df:
            result.extend(proc_word(type_name, r['word'],
                                    r['head_word'] if 'head_word' in r else '',
                                    data['lang']))
            result.extend(proc_children_column(df['rel'], df['children'], data['lang']))

            # df = df.drop('children', 1)  # 1 is the axis number (0 for rows, 1 for columns)
            # df['children'] = df['children'].apply(lambda x: ', '.join(x)[:15] + "..")
            # df['features'] = df['features'].apply(lambda x: ', '.join(x)[:15] + "..")
            trunc_cols(df)
            tc.dfs(df)
            print_stem_chunks(r)

            if print_def:
                NluCli().get_word_def(r['lemma'], data['lang'])
            if print_synsets:
                r = display_synsets(theme, meta, r, data['lang'])
                result.extend(r)

    sinkers.process_with_sinkers()
    return result, df_set
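# Illustrative sketch (hypothetical wiring, assuming the dep_parse and get_chunks helpers
# imported elsewhere in this package): parse a sentence, extract its chunks, and hand them
# to rs_represent together with the request data; the second return value is the list of
# per-chunk dataframes. `_demo_rs_represent` is an added helper, not part of the original.
def _demo_rs_represent():
    from sagas.nlu.uni_remote import dep_parse
    from sagas.nlu.uni_parser import get_chunks

    data = {'lang': 'zh', 'sents': '你有几台笔记本电脑?', 'engine': 'ltp', 'pipelines': []}
    doc_jsonify, _ = dep_parse(data['sents'], data['lang'], data['engine'])
    if doc_jsonify is None:
        return
    rs = get_chunks(doc_jsonify)
    if len(rs) > 0:
        result, df_set = rs_represent(rs, data, return_df=True)
        tc.info('.. %d chunk frame(s)' % len(df_set))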