def langspec(self, sents, lang='en', engine='corenlp', print_no_match=False):
    """
    $ python -m sagas.nlu.rules_lang_spec langspec 'Berapa umur kamu?' id
    $ python -m sagas.nlu.rules_lang_spec langspec 'Siapa yang menulis laporan ini?' id
        ♯ matched id rules: {'ask_event': 0}
        features -> ['ask_event']
    $ python -m sagas.nlu.rules_lang_spec langspec 'Die Nutzung der Seite ist kostenlos.' de
    $ python -m sagas.nlu.rules_lang_spec langspec 'I want to play music.' en corenlp True
    $ python -m sagas.nlu.rules_lang_spec langspec 'このお土産はきれいで安いです。' ja knp True

    :param sents:
    :param lang:
    :param engine:
    :param print_no_match: if True, enable the 'print_not_matched' option so that
        rules which do not match are also printed
    :return:
    """
    from sagas.nlu.uni_remote import dep_parse
    from sagas.nlu.uni_parser import get_chunks
    from sagas.conf.conf import cf

    if print_no_match:
        cf.enable_opt('print_not_matched')
    pipelines = ['predicts']
    meta = {'lang': lang, "sents": sents, 'engine': engine, 'pipelines': pipelines}
    # doc_jsonify, resp = dep_parse(sents, lang, engine, pipelines)
    doc_jsonify, resp = parse_sents(meta)
    rs = get_chunks(doc_jsonify)
    rs_repr(rs, data=meta)
def langspec_id(self, sents, engine='corenlp'):
    """
    $ python -m sagas.nlu.rules_lang_spec langspec_id 'Berapa umur kamu?'

    :param sents:
    :param engine:
    :return:
    """
    from sagas.nlu.uni_remote import dep_parse
    from sagas.nlu.uni_parser import get_chunks
    from sagas.nlu.rules_lang_spec_de import Rules_de
    from sagas.nlu.rules_lang_spec_id import Rules_id

    pipelines = ['predicts']
    lang = 'id'
    doc_jsonify, resp = dep_parse(sents, lang, engine, pipelines)
    rs = get_chunks(doc_jsonify)
    # rs_repr(rs, data={'lang': lang, "sents": sents, 'engine': engine, 'pipelines': pipelines})
    data = {'lang': lang, "sents": sents, 'engine': engine, 'pipelines': pipelines}
    for serial, r in enumerate(rs):
        common = {'lemma': r['lemma'], 'word': r['word'], 'stems': r['stems']}
        meta = {'rel': r['rel'], **common, **data}
        c = Rules_id(meta, r['domains'], doc=doc_jsonify)
        c.root_rules()
        c.execute()
def viz_check(parser, lang, sents):
    """
    >>> from sagas.nlu.uni_impl_hanlp import HanlpParserImpl
    >>> # from sagas.nlu.uni_viz_checker import *
    >>> parser = HanlpParserImpl
    >>> viz_check(parser, 'zh', '我必须关掉房间里的灯。')

    >>> from sagas.nlu.uni_impl_knp import KnpParserImpl
    >>> viz_check(KnpParserImpl, 'ja', '私の趣味は、多くの小旅行をすることです。')

    :param parser:
    :param lang:
    :param sents:
    :return:
    """
    from IPython.display import display

    doc = parser(lang)(sents)
    rs = get_chunks(doc)
    for serial, r in enumerate(rs):
        df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        if 'head' in r:
            cla = "%s(%s)" % (r['head'], r['head_pos'])
        else:
            cla = '_'
        print(serial_numbers[serial], '%s(%s)' % (r['type'], r['lemma']), cla)
        # sagas.print_df(df)
        display(df)
        print_stem_chunks(r)

    cv = EnhancedViz(shape='egg', size='8,5', fontsize=20)
    return cv.analyse_doc(doc, None)
def parse_comps(sents, source):
    """Parse a sentence in the given source language and return its chunks,
    preferring the remote parser's predicts and falling back to get_chunks."""
    sents = fix_sents(sents, source)
    engine = cf.engine(source)
    doc_jsonify, resp = dep_parse(sents, source, engine, ['predicts'])
    if len(resp['predicts']) > 0:
        rs = resp['predicts']
    else:
        rs = get_chunks(doc_jsonify)
    return rs
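
# Usage sketch (hypothetical helper, not part of the original module): assumes the remote
# parser configured in sagas.conf is reachable for the given language; the sample sentence
# is taken from the Indonesian CLI examples elsewhere in this repo.
def _demo_parse_comps():
    rs = parse_comps('Berapa umur kamu?', 'id')
    for r in rs:
        # each chunk carries at least 'type', 'word' and 'lemma'
        print(r['type'], '->', r['word'], '/', r['lemma'])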
def get_domains(sents, lang, engine='corenlp', options=None):
    """
    >>> from sagas.nlu.legacy.aiobj_kit import get_domains
    >>> get_domains('你有几台笔记本电脑?', 'zh', 'ltp')
    >>> get_domains('列出上周编辑的文件。', 'zh', 'ltp', DomainGetOptions(enable_predicts=True))

    :param sents:
    :param lang:
    :param engine:
    :param options: DomainGetOptions controlling predicts, chunk listing and graph display
    :return:
    """
    # from IPython.display import display
    if options is None:
        options = DomainGetOptions()
    pipelines = ['predicts'] if options.enable_predicts else []
    doc_jsonify, resp = dep_parse(sents, lang, engine, pipelines)
    result_set = []
    if doc_jsonify is not None:
        tc.emp('cyan', resp)
        if resp is not None and 'predicts' in resp and len(resp['predicts']) > 0:
            rs = resp['predicts']
            # print(rs)
        else:
            # print(doc_jsonify.words_string())
            rs = get_chunks(doc_jsonify)

        if len(rs) > 0:
            if options.list_chunks:
                list_rs(rs, lang)
            if options.deps_graph:
                # display(display_doc_deps(doc_jsonify, resp))
                tc.gv(display_doc_deps(doc_jsonify, resp,
                                       translit_lang=lang if lang in ('ja', 'ko', 'zh', 'fa', 'ar', 'he') else None))
            # rs_represent(rs, data={'lang': lang, "sents": sents, 'engine': engine,
            #                        'pipelines': pipelines})
            data = {'lang': lang, "sents": sents, 'engine': engine, 'pipelines': pipelines}
            for r in rs:
                # fixture.print_table(r, False)
                # print(f"lemma: {r['lemma']}")
                # df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'lemma', 'children', 'features'])
                # display(df)
                domains = r['domains']
                common = {'lemma': r['lemma'], 'word': r['word'], 'stems': r['stems']}
                meta = {'rel': r['rel'], **common, **data}
                result_set.append((domains, meta))
        else:
            tc.emp('red', '.. no predefined chunk-patterns found.')
            tc.info(doc_jsonify.words_string())
            tc.info(doc_jsonify.dependencies_string())
    return result_set
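
# Usage sketch (hypothetical helper, not part of the original module): shows how the
# (domains, meta) tuples returned by get_domains are typically consumed; the sentence
# and engine mirror the doctest above.
def _demo_get_domains():
    for domains, meta in get_domains('你有几台笔记本电脑?', 'zh', 'ltp'):
        # meta merges chunk info ('rel', 'word', 'lemma', 'stems') with the request data
        print(meta['rel'], meta['word'], meta['lemma'])
        for tup in domains:
            print('\t', tup)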
def list_chunks(doc_jsonify, resp, lang, enable_contrast=False, specified=None):
    if len(resp['predicts']) > 0 and specified is None:
        rs = resp['predicts']
    else:
        rs = get_chunks(doc_jsonify, return_root_chunks_if_absent=True, specified=specified)
    list_rs(rs, lang)
    if enable_contrast:
        _ = list_contrast(rs, lang)
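
# Usage sketch (hypothetical helper, not part of the original module): list_chunks expects
# the (doc_jsonify, resp) pair produced by sagas.nlu.uni_remote.dep_parse with the
# 'predicts' pipeline enabled.
def _demo_list_chunks():
    from sagas.nlu.uni_remote import dep_parse
    doc_jsonify, resp = dep_parse('I want to play music.', 'en', 'corenlp', ['predicts'])
    if doc_jsonify is not None:
        list_chunks(doc_jsonify, resp, 'en')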
def exec_rules(self, sents, lang='en', engine='corenlp'):
    """
    $ python -m sagas.tool.misc exec_rules "今何時ですか?" ja
    $ python -m sagas.tool.misc exec_rules "今何時ですか?" ja knp
    $ python -m sagas.tool.misc exec_rules "望遠鏡で泳いでいる少女を見た。" ja knp
    $ python -m sagas.tool.misc exec_rules 'Мы написали три книги за год.' ru
    $ python -m sagas.tool.misc exec_rules "现在是几点?" zh ltp
    $ rules '我在臺灣開計程車。' zh
    $ rules '我在台湾开出租车。' zh ltp
    $ rules "吸烟对你的健康有害。" zh ltp
    $ rules 'Tini berumur sepuluh tahun.' id
    $ rules 'Berapa umur kamu?' id
        (no predefined chunk patterns are found for this one, so all words and dependencies are printed)

    :param sents:
    :param lang:
    :param engine:
    :return:
    """
    from sagas.nlu.uni_parser import get_chunks
    from sagas.nlu.uni_remote import dep_parse

    pipelines = ['predicts']
    doc_jsonify, resp = dep_parse(sents, lang, engine, pipelines)
    if doc_jsonify is not None:
        color_print('cyan', resp)
        if len(resp['predicts']) > 0:
            rs_represent(resp['predicts'],
                         data={'lang': lang, "sents": sents, 'engine': engine, 'pipelines': pipelines})
        else:
            rs = get_chunks(doc_jsonify)
            if len(rs) > 0:
                # rs_summary(rs)
                rs_represent(rs,
                             data={'lang': lang, "sents": sents, 'engine': engine, 'pipelines': pipelines})
            else:
                color_print('red', '.. no predefined chunk-patterns found.')
                print(doc_jsonify.words_string())
                print(doc_jsonify.dependencies_string())
def print_sents(self, sents, lang, engine=None):
    """
    $ python -m sagas.nlu.ruleset_procs print_sents 'I want to play music.' en
    $ python -m sagas.nlu.ruleset_procs print_sents "クモは4つの右の目をしています。" ja corenlp

    :param sents:
    :param lang:
    :return:
    """
    if engine is None:
        engine = cf.engine(lang)
    data = {'lang': lang, "sents": sents, 'engine': engine}
    doc_jsonify, resp = parse_sents(data)
    rs = get_chunks(doc_jsonify)

    if lang in non_spaces:
        delim = ''
    else:
        delim = ' '
    for serial, r in enumerate(rs):
        meta = build_meta(r, data)
        domains = r['domains']
        # print([(x[0], x[2]) for x in domains])
        keys = {x[0] for x in domains}
        grp = lambda p, idx: [x[idx] for x in domains if x[0] == p]
        tokens = {x: grp(x, 2) for x in keys}
        words = {x: delim.join(grp(x, 2)) for x in keys}
        lemmas = {x: delim.join(grp(x, 3)) for x in keys}
        print('meta keys', meta.keys())
        print('tokens', tokens)
        print('words', meta['word'], words)
        print('lemmas', lemmas)

        ctx = Context(meta, domains)
        # print(ctx.lemmas)
        print('chunks', ctx._chunks)

    g = display_doc_deps(doc_jsonify, resp, translit_lang=lang)
    print(*[(w.index, w.text, w.governor, doc_jsonify.words[w.governor - 1].text)
            for w in doc_jsonify.words], sep='\n')
    tc.gv(g)
def universal_viz(intp, sents):
    from sagas.nlu.uni_parser import get_chunks
    from sagas.tool.misc import print_stem_chunks
    import sagas

    doc = intp(sents)
    doc.build_dependencies()
    # print(doc.dependencies)
    rs = get_chunks(doc)
    # print(rs)
    for r in rs:
        df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        tc.info('%s(%s)' % (r['type'], r['lemma']))
        tc.dfs(df)
        # display(df)
        print_stem_chunks(r)

    cv = EnhancedViz(shape='egg', size='8,5', fontsize=20)
    return cv.analyse_doc(doc, None)
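
# Usage sketch (hypothetical helper, not part of the original module): universal_viz takes
# any callable that maps a sentence to a parsed doc, e.g. a parser implementation already
# bound to a language as in viz_check; whether a particular impl's doc supports
# build_dependencies() is an assumption here.
def _demo_universal_viz():
    from sagas.nlu.uni_impl_knp import KnpParserImpl
    return universal_viz(KnpParserImpl('ja'), '私の趣味は、多くの小旅行をすることです。')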
def parse(self, engine, lang, sents):
    """
    $ python -m sagas.nlu.uni_cli parse corenlp en 'it is a cat'
    $ python -m sagas.nlu.uni_cli parse ltp zh-CN '我送她一束花'
    $ python -m sagas.nlu.uni_cli parse analz zh '我送她一束花'
    $ python -m sagas.nlu.uni_cli parse hanlp zh-CN '我送她一束花'
    $ python -m sagas.nlu.uni_cli parse spacy en 'it is a cat'
    $ python -m sagas.nlu.uni_cli parse knp ja '私の趣味は、多くの小旅行をすることです。'
    $ python -m sagas.nlu.uni_cli parse knp ja 'ケーキの甘みが好きじゃなかった。'
    $ python -m sagas.nlu.uni_cli parse spacy_2.2 lt 'Ji dirba prie kompiuterio.'
    $ python -m sagas.nlu.uni_cli parse stanford en 'it is a cat'

    :return:
    """
    from sagas.nlu.uni_parser import get_chunks
    import sagas
    from sagas.tool.misc import print_stem_chunks
    from sagas.tool.misc import color_print
    from pprint import pprint

    def print_r(r):
        df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        print('%s(%s)' % (r['type'], r['lemma']))
        sagas.print_df(df)
        print_stem_chunks(r)

    engine = engine.split('_')[0]
    print(f'using engine {engine} ...')
    # parser = CoreNlpParserImpl('en')
    # doc = parser('it is a cat')
    doc = self._parsers[engine](lang, sents)
    # color_print('blue', doc.predicts)
    pprint(doc.predicts)
    if doc.has_predicts():
        for r in doc.predicts:
            print_r(r)
    else:
        rs = get_chunks(doc)
        for r in rs:
            print_r(r)
def viz_sample(self, lang: Text, sents: Text, engine='corenlp'):
    """
    $ python -m sagas.nlu.uni_jsonifier viz_sample ja "今何時ですか?"
    >>> viz_sample('ja', "今何時ですか?")

    :param lang:
    :param sents:
    :param engine:
    :return:
    """
    from sagas.nlu.uni_cli import UniCli
    from sagas.nlu.uni_parser import get_chunks

    uni = UniCli()
    doc = uni.parser(engine)(lang, sents)
    # print(len(doc.words))
    words = sent_jsonify(doc)
    doc_jsonify = JsonifySentImpl(words, text=sents)
    rs = get_chunks(doc_jsonify)
    rs_summary(rs)
def sents_summary(sents, source):
    from sagas.nlu.uni_remote import dep_parse
    from sagas.nlu.uni_remote_viz import list_contrast
    from sagas.conf.conf import cf
    from sagas.nlu.utils import fix_sents
    from sagas.nlu.uni_parser import get_chunks

    sents = fix_sents(sents, source)
    engine = cf.engine(source)
    doc_jsonify, resp = dep_parse(sents, source, engine, ['predicts'])
    types = []
    if doc_jsonify is None:
        raise Exception(f'Cannot parse sentence for lang {source}')
    if len(resp['predicts']) > 0:
        rs = resp['predicts']
    else:
        rs = get_chunks(doc_jsonify)
    for serial, r in enumerate(rs):
        print(f"{serial}. {r['type']} -> {r['word']}")
        types.append(f"{source}:{r['type']}")
    list_contrast(rs, source)
    return types
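
# Usage sketch (hypothetical helper, not part of the original module): sents_summary
# prints one numbered line per chunk and returns the chunk types prefixed with the
# source language (the exact type names depend on the parser output).
def _demo_sents_summary():
    types = sents_summary('Berapa umur kamu?', 'id')
    print(types)  # e.g. ['id:<chunk-type>', ...]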
def handle_verb_domains():
    # from sagas.nlu.uni_parser import get_verb_domain, get_aux_domain, get_subj_domain
    from sagas.nlu.uni_parser import get_chunks
    from sagas.nlu.uni_cli import parse_with

    content = request.get_json()
    sents = content['sents']
    lang = content['lang']
    engine = get_engine(lang, content)
    sents = fix_sents(lang, sents)

    # nlp = get_nlp(lang)
    # doc = nlp(sents)
    # sent = doc.sentences[0]
    logger.debug(f".. parse sents [{sents}] with engine {engine} in lang {lang}")
    sent = parse_with(sents, lang, engine=engine)

    # r = get_verb_domain(sent, ['obl', 'nsubj:pass'])
    # if len(r) == 0:
    #     r = get_aux_domain(sent, ['obl', 'nsubj:pass'])
    # if len(r) == 0:
    #     r = get_subj_domain(sent)
    disable_predicts = is_disabled(content, 'disable_predicts')
    predicts_count = len(sent.predicts)
    if predicts_count > 0 and not disable_predicts:
        r = sent.predicts
    else:
        logger.debug(f"predicts count is {predicts_count}, option disable_predicts is {disable_predicts}")
        r = get_chunks(sent)

    data_y = json.dumps(r, ensure_ascii=False)
    return data_y
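
# Request sketch (illustrative only; the URL route that binds this handler is not shown here,
# and the optional fields are inferred from the handler body): the JSON payload carries
# 'sents' and 'lang', an engine hint resolved via get_engine(lang, content), and an
# optional 'disable_predicts' flag checked through is_disabled().
#
#   POST <verb-domains endpoint>
#   {"sents": "I want to play music.", "lang": "en", "disable_predicts": false}
#
# The response body is the JSON-serialized chunk list taken from sent.predicts, or from
# get_chunks(sent) when predicts are empty or disabled.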
def dep_parse(self, sents, lang='en', engine='corenlp'):
    """
    $ python -m sagas.tool.misc dep_parse 'Мы написали три книги за год.' ru
    $ python -m sagas.tool.misc dep_parse "今何時ですか?" ja
    $ python -m sagas.tool.misc dep_parse "今何時ですか?" ja knp
    $ python -m sagas.tool.misc dep_parse "私の趣味は、多くの小旅行をすることです。" ja knp
    $ python -m sagas.tool.misc dep_parse "自由を手に入れる" ja
    $ python -m sagas.tool.misc dep_parse "现在是几点?" zh ltp

    :param sents:
    :param lang:
    :param engine:
    :return:
    """
    from sagas.nlu.uni_jsonifier import rs_summary
    from sagas.nlu.uni_parser import get_chunks
    from sagas.nlu.uni_remote import dep_parse

    doc_jsonify, resp = dep_parse(sents, lang, engine, ['predicts'])
    rs = get_chunks(doc_jsonify)
    rs_summary(rs)
    print('-' * 25, 'predicts')
    pprint(resp)
    print('-' * 25, 'doc')
    pprint(doc_jsonify.as_json)
def predict(self, data: Dict[Text, Any], rule_str: Text, name='_none_',
            engine=None, graph=False, operator=all) -> bool:
    """
    >>> from sagas.tool.dynamic_rules import DynamicRules
    >>> data = {'lang': 'ja', "sents": '彼のパソコンは便利じゃない。'}
    >>> DynamicRules().predict(data, "subj('adj',ガ=kindof('artifact', 'n'))", engine='knp')

    :param data: request data with at least 'lang' and 'sents'
    :param rule_str: rule expression to evaluate against each chunk
    :param name:
    :param engine:
    :param graph: if True, also render a console visualization of the sentence
    :param operator: aggregation over the per-chunk match results, e.g. all or any
    :return:
    """
    import sagas.tracker_fn as tc
    from sagas.kit.analysis_kit import AnalysisKit

    # ft = InspectorFixture()
    # domains, meta = ft.request_domains(data, engine=engine)
    if engine is None:
        engine = cf.engine(data['lang'])
    pipelines = ['predicts']
    tc.emp('magenta', f"({data['lang']}) {data['sents']}")
    doc_jsonify, resp = dep_parse(data['sents'], data['lang'], engine, pipelines)
    if doc_jsonify is not None:
        if len(resp['predicts']) > 0:
            domains_set = resp['predicts']
        else:
            domains_set = get_chunks(doc_jsonify)

        if graph:
            AnalysisKit().console_vis(data['sents'], data['lang'])

        check_r = []
        for r in domains_set:
            domains = r['domains']
            meta = build_meta(r, data)
            print(r['type'], meta['index'], meta['word'], meta['lemma'], list(meta.keys()))
            position = doc_jsonify.get_position(meta['index'])
            pprint(domains)

            # agency = ['c_pron', 'c_noun']
            pat = lambda p, name='': Patterns(domains, meta, p, name=name, doc=doc_jsonify)
            # rs = interp(f"[Patterns(domains, meta, {self.priority}, name='{name}').{rule_str}]",
            if rule_str.startswith('pat('):
                pattern_text = f"[{rule_str}]"
            else:
                pattern_text = f"[pat({self.priority}, name='{name}').{rule_str}]"
            rs = interp(pattern_text, domains, meta, pat)
            print_result(rs)

            # collect the matched contexts' results: r[1] is true/false, r[3] is the context
            results = [el for r in rs for el in r[3].results if r[1]]
            # r[2] is the priority
            succ = [abs(r[2]) for r in rs if r[1]]
            priority = max(succ) if len(succ) > 0 else 0
            self.priority_list.append(priority)
            self.result_set.extend(results)
            self.rasa_ents.append({
                'confidence': None,
                'start': position[0],
                'end': position[1],
                'entity': r['type'],
                'extractor': 'ruleset',
                'value': f"{meta['word']}/{meta['lemma']}",
                'additional_info': results,
            })
            check_r.append(operator([r[1] for r in rs]))
        return operator(check_r)
    return False
def get_chunks(self, sent):
    from sagas.nlu.uni_parser import get_chunks
    return get_chunks(sent)