Example #1
    def langspec(self, sents, lang='en', engine='corenlp', print_no_match=False):
        """
        $ python -m sagas.nlu.rules_lang_spec langspec 'Berapa umur kamu?' id
        $ python -m sagas.nlu.rules_lang_spec langspec 'Siapa yang menulis laporan ini?' id
            ♯ matched id rules: {'ask_event': 0}
            features -> ['ask_event']
        $ python -m sagas.nlu.rules_lang_spec langspec 'Die Nutzung der Seite ist kostenlos.' de
        $ python -m sagas.nlu.rules_lang_spec langspec 'I want to play music.' en corenlp True
        $ python -m sagas.nlu.rules_lang_spec langspec 'このお土産はきれいで安いです。' ja knp True

        :param sents:
        :param lang:
        :param engine:
        :return:
        """
        from sagas.nlu.uni_remote import dep_parse
        from sagas.nlu.uni_parser import get_chunks
        from sagas.conf.conf import cf

        if print_no_match:
            cf.enable_opt('print_not_matched')
        pipelines = ['predicts']
        meta = {'lang': lang, "sents": sents, 'engine': engine, 'pipelines': pipelines}
        # doc_jsonify, resp = dep_parse(sents, lang, engine, pipelines)
        doc_jsonify, resp = parse_sents(meta)
        rs = get_chunks(doc_jsonify)

        rs_repr(rs, data=meta)
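All of the examples on this page iterate over the chunk dicts returned by get_chunks. The sketch below is only an inference from the fields these snippets access (r['type'], r['word'], r['lemma'], r['stems'] and the rows of r['domains'] in ['rel', 'index', 'text', 'lemma', 'children', 'features'] order); it is not the authoritative schema of sagas.nlu.uni_parser.get_chunks:

def dump_chunks(rs):
    # Sketch only: field names inferred from how the examples on this page use each chunk.
    for serial, r in enumerate(rs):
        # head-level information of the chunk
        print(serial, r['type'], r['word'], r['lemma'])
        # each domain row follows the column order passed to sagas.to_df in the
        # examples below: ['rel', 'index', 'text', 'lemma', 'children', 'features']
        for rel, index, text, lemma, children, features in r['domains']:
            print('  ', rel, index, text, lemma, features)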
Example #2
    def langspec_id(self, sents, engine='corenlp'):
        """
        $ python -m sagas.nlu.rules_lang_spec langspec_id 'Berapa umur kamu?'
        :param sents:
        :param engine:
        :return:
        """
        from sagas.nlu.uni_remote import dep_parse
        from sagas.nlu.uni_parser import get_chunks
        from sagas.nlu.rules_lang_spec_de import Rules_de
        from sagas.nlu.rules_lang_spec_id import Rules_id

        pipelines = ['predicts']
        lang = 'id'
        doc_jsonify, resp = dep_parse(sents, lang, engine, pipelines)
        rs = get_chunks(doc_jsonify)
        # rs_repr(rs, data={'lang': lang, "sents": sents, 'engine': engine, 'pipelines': pipelines})
        data = {'lang': lang, "sents": sents, 'engine': engine, 'pipelines': pipelines}
        for serial, r in enumerate(rs):
            common = {'lemma': r['lemma'], 'word': r['word'],
                      'stems': r['stems']}
            meta = {'rel': r['rel'], **common, **data}
            c = Rules_id(meta, r['domains'], doc=doc_jsonify)
            c.root_rules()
            c.execute()
def viz_check(parser, lang, sents):
    """
    >>> from sagas.nlu.uni_impl_hanlp import HanlpParserImpl
    >>> # from sagas.nlu.uni_viz_checker import *
    >>> parser=HanlpParserImpl
    >>> viz_check(parser, 'zh', '我必须关掉房间里的灯。')
    >>> from sagas.nlu.uni_impl_knp import KnpParserImpl
    >>> viz_check(KnpParserImpl, 'ja', '私の趣味は、多くの小旅行をすることです。')
    :param parser:
    :param lang:
    :param sents:
    :return:
    """
    from IPython.display import display

    doc = parser(lang)(sents)
    rs = get_chunks(doc)

    for serial, r in enumerate(rs):
        df = sagas.to_df(
            r['domains'],
            ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        if 'head' in r:
            cla = "%s(%s)" % (r['head'], r['head_pos'])
        else:
            cla = '_'
        print(serial_numbers[serial], '%s(%s)' % (r['type'], r['lemma']), cla)
        # sagas.print_df(df)
        display(df)
        print_stem_chunks(r)

    cv = EnhancedViz(shape='egg', size='8,5', fontsize=20)
    return cv.analyse_doc(doc, None)
Example #4
def parse_comps(sents, source):
    sents = fix_sents(sents, source)

    engine = cf.engine(source)
    doc_jsonify, resp = dep_parse(sents, source, engine, ['predicts'])
    if len(resp['predicts']) > 0:
        rs = resp['predicts']
    else:
        rs = get_chunks(doc_jsonify)
    return rs
Example #5
def get_domains(sents, lang, engine='corenlp', options=None):
    """
    >>> from sagas.nlu.legacy.aiobj_kit import get_domains
    >>> get_domains('你有几台笔记本电脑?', 'zh', 'ltp')
    >>> get_domains('列出上周编辑的文件。', 'zh', 'ltp', DomainGetOptions(enable_predicts=True))

    :param sents:
    :param lang:
    :param engine:
    :param options:
    :return:
    """
    # from IPython.display import display

    if options is None:
        options = DomainGetOptions()
    pipelines = ['predicts'] if options.enable_predicts else []
    doc_jsonify, resp = dep_parse(sents, lang, engine, pipelines)
    result_set = []
    if doc_jsonify is not None:
        tc.emp('cyan', resp)
        if resp is not None and 'predicts' in resp and len(resp['predicts']) > 0:
            rs = resp['predicts']
            # print(rs)
        else:
            # print(doc_jsonify.words_string())
            rs = get_chunks(doc_jsonify)
        if len(rs) > 0:
            if options.list_chunks:
                list_rs(rs, lang)
            if options.deps_graph:
                # display(display_doc_deps(doc_jsonify, resp))
                tc.gv(display_doc_deps(doc_jsonify, resp,
                                       translit_lang=lang if lang in ('ja', 'ko', 'zh', 'fa', 'ar', 'he') else None))
            # rs_represent(rs, data = {'lang': lang, "sents": sents, 'engine': engine,
            #                         'pipelines':pipelines})
            data = {'lang': lang, "sents": sents, 'engine': engine,
                    'pipelines': pipelines}
            for r in rs:
                # fixture.print_table(r, False)
                # print(f"lemma: {r['lemma']}")
                # df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'lemma', 'children', 'features'])
                # display(df)
                domains = r['domains']
                common = {'lemma': r['lemma'], 'word': r['word'],
                          'stems': r['stems']}
                meta = {'rel': r['rel'], **common, **data}
                result_set.append((domains, meta))
        else:
            tc.emp('red', '.. no predefined chunk-patterns found.')
            tc.info(doc_jsonify.words_string())
            tc.info(doc_jsonify.dependencies_string())
    return result_set
def list_chunks(doc_jsonify,
                resp,
                lang,
                enable_contrast=False,
                specified=None):
    if len(resp['predicts']) > 0 and specified is None:
        rs = resp['predicts']
    else:
        rs = get_chunks(doc_jsonify,
                        return_root_chunks_if_absent=True,
                        specified=specified)
    list_rs(rs, lang)
    if enable_contrast:
        _ = list_contrast(rs, lang)
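get_domains above returns a list of (domains, meta) tuples; a minimal hypothetical caller is sketched below. The import path and the sample call come from the function's docstring, and the meta keys are the ones assembled in its body.

from sagas.nlu.legacy.aiobj_kit import get_domains

for domains, meta in get_domains('列出上周编辑的文件。', 'zh', 'ltp'):
    # meta carries the chunk head ('rel', 'word', 'lemma', 'stems') plus the
    # request data ('lang', 'sents', 'engine', 'pipelines')
    print(meta['rel'], meta['word'], meta['lemma'])
    for rel, index, text, lemma, *rest in domains:
        print('  ', rel, text, lemma)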
Example #7
    def exec_rules(self, sents, lang='en', engine='corenlp'):
        """
        $ python -m sagas.tool.misc exec_rules "今何時ですか?" ja
        $ python -m sagas.tool.misc exec_rules "今何時ですか?" ja knp
        $ python -m sagas.tool.misc exec_rules "望遠鏡で泳いでいる少女を見た。" ja knp
        $ python -m sagas.tool.misc exec_rules 'Мы написали три книги за год.' ru
        $ python -m sagas.tool.misc exec_rules "现在是几点?" zh ltp
        $ rules '我在臺灣開計程車。' zh
        $ rules '我在台湾开出租车。' zh ltp
        $ rules "吸烟对你的健康有害。" zh ltp
        $ rules 'Tini berumur sepuluh tahun.' id
        $ rules 'Berapa umur kamu?' id  (no predefined chunk patterns are found, so all words and dependencies are printed)

        :param sents:
        :param lang:
        :param engine:
        :return:
        """
        from sagas.nlu.uni_parser import get_chunks
        from sagas.nlu.uni_remote import dep_parse

        pipelines = ['predicts']
        doc_jsonify, resp = dep_parse(sents, lang, engine, pipelines)
        if doc_jsonify is not None:
            color_print('cyan', resp)
            if len(resp['predicts']) > 0:
                rs_represent(resp['predicts'],
                             data={
                                 'lang': lang,
                                 "sents": sents,
                                 'engine': engine,
                                 'pipelines': pipelines
                             })
            else:
                rs = get_chunks(doc_jsonify)
                if len(rs) > 0:
                    # rs_summary(rs)
                    rs_represent(rs,
                                 data={
                                     'lang': lang,
                                     "sents": sents,
                                     'engine': engine,
                                     'pipelines': pipelines
                                 })
                else:
                    color_print('red',
                                '.. no predefined chunk-patterns found.')
                    print(doc_jsonify.words_string())
                    print(doc_jsonify.dependencies_string())
Example #8
    def print_sents(self, sents, lang, engine=None):
        """
        $ python -m sagas.nlu.ruleset_procs print_sents 'I want to play music.' en
        $ python -m sagas.nlu.ruleset_procs print_sents "クモは4つの右の目をしています。" ja corenlp

        :param sents:
        :param lang:
        :return:
        """
        # lang = 'en'
        if engine is None:
            engine = cf.engine(lang)
        data = {'lang': lang, "sents": sents, 'engine': engine}
        doc_jsonify, resp = parse_sents(data)
        rs = get_chunks(doc_jsonify)

        if lang in non_spaces:
            delim = ''
        else:
            delim = ' '
        for serial, r in enumerate(rs):
            meta = build_meta(r, data)
            domains = r['domains']
            # print([(x[0], x[2]) for x in domains])
            #
            keys = {x[0] for x in domains}
            grp = lambda p, idx: [x[idx] for x in domains if x[0] == p]
            tokens = {x: grp(x, 2) for x in keys}
            words = {x: delim.join(grp(x, 2)) for x in keys}
            lemmas = {x: delim.join(grp(x, 3)) for x in keys}
            print('meta keys', meta.keys())
            print('tokens', tokens)
            print('words', meta['word'], words)
            print('lemmas', lemmas)
            #
            ctx = Context(meta, domains)
            # print(ctx.lemmas)
            print('chunks', ctx._chunks)

        g = display_doc_deps(doc_jsonify, resp, translit_lang=lang)
        print(*[(w.index, w.text, w.governor,
                 doc_jsonify.words[w.governor - 1].text)
                for w in doc_jsonify.words],
              sep='\n')
        tc.gv(g)
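print_sents groups each chunk's domain rows by relation name. The standalone sketch below replays that grouping on hypothetical rows (the data is made up for illustration; only the (rel, index, text, lemma, children, features) column order comes from the examples on this page):

# hypothetical rows, not actual parser output
domains = [('nsubj', 1, 'I', 'I', [], []),
           ('xcomp', 4, 'play', 'play', [], []),
           ('xcomp', 5, 'music', 'music', [], [])]
keys = {d[0] for d in domains}
words = {k: ' '.join(d[2] for d in domains if d[0] == k) for k in keys}
lemmas = {k: ' '.join(d[3] for d in domains if d[0] == k) for k in keys}
print(words)    # {'nsubj': 'I', 'xcomp': 'play music'}
print(lemmas)   # {'nsubj': 'I', 'xcomp': 'play music'}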
Example #9
def universal_viz(intp, sents):
    from sagas.nlu.uni_parser import get_chunks
    from sagas.tool.misc import print_stem_chunks
    import sagas

    doc = intp(sents)
    doc.build_dependencies()
    # print(doc.dependencies)
    rs = get_chunks(doc)
    # print(rs)
    for r in rs:
        df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        tc.info('%s(%s)' % (r['type'], r['lemma']))
        tc.dfs(df)
        # display(df)
        print_stem_chunks(r)

    cv = EnhancedViz(shape='egg', size='8,5', fontsize=20)
    return cv.analyse_doc(doc, None)
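A hypothetical invocation of universal_viz above: intp is any callable that maps a sentence to a parsed doc, for example a parser implementation already bound to a language. The KnpParserImpl import path is taken from the viz_check doctest earlier on this page; universal_viz itself is assumed to be in scope or importable from its module.

from sagas.nlu.uni_impl_knp import KnpParserImpl

universal_viz(KnpParserImpl('ja'), '私の趣味は、多くの小旅行をすることです。')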
Example #10
    def parse(self, engine, lang, sents):
        """
        $ python -m sagas.nlu.uni_cli parse corenlp en 'it is a cat'
        $ python -m sagas.nlu.uni_cli parse ltp zh-CN '我送她一束花'
        $ python -m sagas.nlu.uni_cli parse analz zh '我送她一束花'
        $ python -m sagas.nlu.uni_cli parse hanlp zh-CN '我送她一束花'
        $ python -m sagas.nlu.uni_cli parse spacy en 'it is a cat'
        $ python -m sagas.nlu.uni_cli parse knp ja '私の趣味は、多くの小旅行をすることです。'
        $ python -m sagas.nlu.uni_cli parse knp ja 'ケーキの甘みが好きじゃなかった。'
        $ python -m sagas.nlu.uni_cli parse spacy_2.2 lt 'Ji dirba prie kompiuterio.'
        $ python -m sagas.nlu.uni_cli parse stanford en 'it is a cat'

        :return:
        """
        from sagas.nlu.uni_parser import get_chunks
        import sagas
        from sagas.tool.misc import print_stem_chunks
        from sagas.tool.misc import color_print
        from pprint import pprint

        def print_r(r):
            df = sagas.to_df(
                r['domains'],
                ['rel', 'index', 'text', 'lemma', 'children', 'features'])
            print('%s(%s)' % (r['type'], r['lemma']))
            sagas.print_df(df)
            print_stem_chunks(r)

        engine = engine.split('_')[0]
        print(f'using engine {engine} ...')
        # parser = CoreNlpParserImpl('en')
        # doc = parser('it is a cat')
        doc = self._parsers[engine](lang, sents)
        # color_print('blue', doc.predicts)
        pprint(doc.predicts)
        if doc.has_predicts():
            for r in doc.predicts:
                print_r(r)
        else:
            rs = get_chunks(doc)
            for r in rs:
                print_r(r)
Example #11
    def viz_sample(self, lang:Text, sents:Text, engine='corenlp'):
        """
        $ python -m sagas.nlu.uni_jsonifier viz_sample ja "今何時ですか?"
        >>> viz_sample('ja', "今何時ですか?")
        :param lang:
        :param sents:
        :param engine:
        :return:
        """
        from sagas.nlu.uni_cli import UniCli
        from sagas.nlu.uni_parser import get_chunks

        uni = UniCli()
        doc = uni.parser(engine)(lang, sents)
        # print(len(doc.words))
        words = sent_jsonify(doc)

        doc_jsonify = JsonifySentImpl(words, text=sents)
        rs = get_chunks(doc_jsonify)
        rs_summary(rs)
Example #12
def sents_summary(sents, source):
    from sagas.nlu.uni_remote import dep_parse
    from sagas.nlu.uni_remote_viz import list_contrast
    from sagas.conf.conf import cf
    from sagas.nlu.utils import fix_sents
    from sagas.nlu.uni_parser import get_chunks

    sents = fix_sents(sents, source)
    engine = cf.engine(source)
    doc_jsonify, resp = dep_parse(sents, source, engine, ['predicts'])
    types = []
    if doc_jsonify is None:
        raise Exception(f'Cannot parse sentence for lang {source}')
    if len(resp['predicts']) > 0:
        rs = resp['predicts']
    else:
        rs = get_chunks(doc_jsonify)

    for serial, r in enumerate(rs):
        print(f"{serial}. {r['type']} -> {r['word']}")
        types.append(f"{source}:{r['type']}")
    list_contrast(rs, source)
    return types
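sents_summary prints one '<serial>. <type> -> <word>' line per chunk and returns the chunk types prefixed with the language code; a hypothetical call (the sentence is taken from the exec_rules docstring above, and sents_summary is assumed to be in scope):

types = sents_summary('Tini berumur sepuluh tahun.', 'id')
print(types)   # e.g. ['id:<chunk-type>', ...] -- the actual type names depend on the parser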
Example #13
def handle_verb_domains():
    # from sagas.nlu.uni_parser import get_verb_domain, get_aux_domain, get_subj_domain
    from sagas.nlu.uni_parser import get_chunks
    from sagas.nlu.uni_cli import parse_with

    content = request.get_json()
    sents = content['sents']
    lang = content['lang']
    engine = get_engine(lang, content)

    sents = fix_sents(lang, sents)

    # nlp = get_nlp(lang)
    # doc = nlp(sents)
    # sent = doc.sentences[0]
    logger.debug(
        f".. parse sents [{sents}] with engine {engine} in lang {lang}")
    sent = parse_with(sents, lang, engine=engine)

    # r = get_verb_domain(sent, ['obl', 'nsubj:pass'])
    # if len(r)==0:
    #     r=get_aux_domain(sent, ['obl', 'nsubj:pass'])
    # if len(r)==0:
    #     r = get_subj_domain(sent)

    disable_predicts = is_disabled(content, 'disable_predicts')
    predicts_count = len(sent.predicts)
    if predicts_count > 0 and not disable_predicts:
        r = sent.predicts
    else:
        logger.debug(
            f"predicts count is {predicts_count}, option disable_predicts is {disable_predicts}"
        )
        r = get_chunks(sent)
    data_y = json.dumps(r, ensure_ascii=False)
    return data_y
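handle_verb_domains reads its input from the request JSON. In the sketch below only the field names ('sents', 'lang', plus the optional engine selection and 'disable_predicts' flag) come from the handler itself; the URL and port are placeholders for wherever the service is actually mounted.

import requests

payload = {'sents': '私の趣味は、多くの小旅行をすることです。', 'lang': 'ja'}
# placeholder URL/route: substitute the actual host and endpoint of the deployed service
resp = requests.post('http://localhost:8090/verb_domains', json=payload)
chunks = resp.json()   # the predicts (or get_chunks fallback) serialized by json.dumps above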
Example #14
    def dep_parse(self, sents, lang='en', engine='corenlp'):
        """
        $ python -m sagas.tool.misc dep_parse 'Мы написали три книги за год.' ru
        $ python -m sagas.tool.misc dep_parse "今何時ですか?" ja
        $ python -m sagas.tool.misc dep_parse "今何時ですか?" ja knp
        $ python -m sagas.tool.misc dep_parse "私の趣味は、多くの小旅行をすることです。" ja knp
        $ python -m sagas.tool.misc dep_parse "自由を手に入れる" ja
        $ python -m sagas.tool.misc dep_parse "现在是几点?" zh ltp
        :param sents:
        :param lang:
        :param engine:
        :return:
        """
        from sagas.nlu.uni_jsonifier import rs_summary
        from sagas.nlu.uni_parser import get_chunks
        from sagas.nlu.uni_remote import dep_parse

        doc_jsonify, resp = dep_parse(sents, lang, engine, ['predicts'])
        rs = get_chunks(doc_jsonify)
        rs_summary(rs)
        print('-' * 25, 'predicts')
        pprint(resp)
        print('-' * 25, 'doc')
        pprint(doc_jsonify.as_json)
Example #15
    def predict(self,
                data: Dict[Text, Any],
                rule_str: Text,
                name='_none_',
                engine=None,
                graph=False,
                operator=all) -> bool:
        """
        >>> from sagas.tool.dynamic_rules import DynamicRules
        >>> data = {'lang': 'ja', "sents": '彼のパソコンは便利じゃない。'}
        >>> DynamicRules().predict(data, "subj('adj',ガ=kindof('artifact', 'n'))", engine='knp')

        :param data:
        :param rule_str:
        :param name:
        :param engine:
        :return:
        """
        import sagas.tracker_fn as tc
        from sagas.kit.analysis_kit import AnalysisKit

        # ft=InspectorFixture()
        # domains, meta=ft.request_domains(data, engine=engine)
        if engine is None:
            engine = cf.engine(data['lang'])
        pipelines = ['predicts']

        tc.emp('magenta', f"({data['lang']}) {data['sents']}")
        doc_jsonify, resp = dep_parse(data['sents'], data['lang'], engine,
                                      pipelines)
        if doc_jsonify is not None:
            if len(resp['predicts']) > 0:
                domains_set = resp['predicts']
            else:
                domains_set = get_chunks(doc_jsonify)

            if graph:
                AnalysisKit().console_vis(data['sents'], data['lang'])

            check_r = []
            for r in domains_set:
                domains = r['domains']
                meta = build_meta(r, data)
                print(r['type'], meta['index'], meta['word'], meta['lemma'],
                      list(meta.keys()))
                position = doc_jsonify.get_position(meta['index'])
                pprint(domains)
                # agency = ['c_pron', 'c_noun']
                pat = lambda p, name='': Patterns(
                    domains, meta, p, name=name, doc=doc_jsonify)
                # rs = interp(f"[Patterns(domains, meta, {self.priority}, name='{name}').{rule_str}]",
                if rule_str.startswith('pat('):
                    pattern_text = f"[{rule_str}]"
                else:
                    pattern_text = f"[pat({self.priority}, name='{name}').{rule_str}]"
                rs = interp(pattern_text, domains, meta, pat)
                print_result(rs)

                # collect matched context's results
                # r[1] is true/false, r[3] is context
                results = [el for r in rs for el in r[3].results if r[1]]
                # r[2] is priority
                succ = [abs(r[2]) for r in rs if r[1]]
                priority = max(succ) if len(succ) > 0 else 0
                self.priority_list.append(priority)

                self.result_set.extend(results)
                self.rasa_ents.append({
                    'confidence': None,
                    'start': position[0],
                    'end': position[1],
                    'entity': r['type'],
                    'extractor': 'ruleset',
                    'value': f"{meta['word']}/{meta['lemma']}",
                    'additional_info': results,
                })

                check_r.append(operator([r[1] for r in rs]))

            return operator(check_r)

        return False
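The doctest in predict's docstring can be re-run with the operator parameter made explicit. This is a sketch based on that docstring; as the check_r handling above shows, operator is applied both to the rule results within each chunk and across chunks, so any is the permissive variant of the default all.

from sagas.tool.dynamic_rules import DynamicRules

data = {'lang': 'ja', 'sents': '彼のパソコンは便利じゃない。'}
matched = DynamicRules().predict(data, "subj('adj',ガ=kindof('artifact', 'n'))",
                                 engine='knp', operator=any)
print(matched)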
Example #16
    def get_chunks(self, sent):
        from sagas.nlu.uni_parser import get_chunks
        return get_chunks(sent)