Code example #1
File: aiobj_kit.py Project: CoderOverflow/stack
def get_domains(sents, lang, engine='corenlp', options=None):
    """
    >>> from sagas.nlu.legacy.aiobj_kit import get_domains
    >>> get_domains('你有几台笔记本电脑?', 'zh', 'ltp')
    >>> get_domains('列出上周编辑的文件。', 'zh', 'ltp', DomainGetOptions(enable_predicts=True))

    :param sents: sentence text to analyse
    :param lang: language code, e.g. 'zh'
    :param engine: parser engine, e.g. 'corenlp' or 'ltp'
    :param options: DomainGetOptions; a default instance is used when omitted
    :return: list of (domains, meta) tuples
    """
    # from IPython.display import display

    if options is None:
        options=DomainGetOptions()
    pipelines=['predicts'] if options.enable_predicts else []
    doc_jsonify, resp = dep_parse(sents, lang, engine, pipelines)
    result_set=[]
    if doc_jsonify is not None:
        tc.emp('cyan', resp)
        if resp is not None and 'predicts' in resp and len(resp['predicts'])>0:
            rs=resp['predicts']
            # print(rs)
        else:
            # print(doc_jsonify.words_string())
            rs = get_chunks(doc_jsonify)
        if len(rs)>0:
            if options.list_chunks:
                list_rs(rs, lang)
            if options.deps_graph:
                # display(display_doc_deps(doc_jsonify, resp))
                tc.gv(display_doc_deps(doc_jsonify, resp,
                                       translit_lang=lang if lang in ('ja', 'ko', 'zh', 'fa', 'ar', 'he') else None))
            # rs_represent(rs, data = {'lang': lang, "sents": sents, 'engine': engine,
            #                         'pipelines':pipelines})
            data = {'lang': lang, "sents": sents, 'engine': engine,
                                     'pipelines':pipelines}
            for r in rs:
                # fixture.print_table(r, False)
                # print(f"lemma: {r['lemma']}")
                # df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'lemma', 'children', 'features'])
                # display(df)
                domains = r['domains']
                common = {'lemma': r['lemma'], 'word': r['word'],
                          'stems': r['stems']}
                meta = {'rel': r['rel'], **common, **data}
                result_set.append((domains, meta))
        else:
            tc.emp('red', '.. no predefined chunk-patterns found.')
            tc.info(doc_jsonify.words_string())
            tc.info(doc_jsonify.dependencies_string())
    return result_set
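
A minimal consumption sketch, assuming DomainGetOptions is exported by the same aiobj_kit module as the doctest above implies; it iterates the (domains, meta) tuples and reads only the meta keys that get_domains populates:

from sagas.nlu.legacy.aiobj_kit import get_domains, DomainGetOptions

result_set = get_domains('你有几台笔记本电脑?', 'zh', 'ltp',
                         DomainGetOptions(enable_predicts=True))
for domains, meta in result_set:
    # meta carries 'rel', 'lemma', 'word', 'stems' plus the request data
    print(meta['rel'], meta['lemma'], meta['word'], meta['lang'])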
Code example #2
File: misc.py Project: CoderOverflow/stack
def print_terms_zh(sents, result):
    from termcolor import colored
    for verb in result['verbs']:
        sents = sents.replace(verb, colored(verb, 'green'))
    for item, value in result.items():
        if 'sbv' in item:
            sents = sents.replace(value, colored(value, 'red'))
        if 'ob' in item:
            sents = sents.replace(value, colored(value, 'blue'))
        # if 'cop' in item:
        #     sents = sents.replace(value, colored(value, 'magenta'))
    tc.info('%s: %s' % (result['lang'], sents))
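
A hedged usage sketch with a hypothetical parse-result dict; the keys mirror what print_terms_zh actually reads ('verbs', 'lang', and entries whose key names contain 'sbv' or 'ob'), and the sagas.tool.misc import path is an assumption based on how misc.py is imported elsewhere in this project:

from sagas.tool.misc import print_terms_zh  # assumed module path for misc.py

# hypothetical result dict, shaped like the one this function expects
result = {'lang': 'zh', 'verbs': ['有'], 'sbv': '你', 'vob': '笔记本电脑'}
print_terms_zh('你有几台笔记本电脑?', result)  # verb in green, subject in red, object in blue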
Code example #3
File: nlu_tools.py Project: CoderOverflow/stack
    def clip_parse(self, source, sents='', specified='default', do_test=False):
        """
        >> clip text: ‫یک آبجو مى خواهم.‬
        $ nlu clip_parse fa
        $ engine='stanford' nluc ar
        $ nlu clip_parse fi 'Tuolla ylhäällä asuu vanha nainen.'
        $ nluc nl 'De vrouw heeft verschillende appels.'
        $ nluc id 'Ini adalah judul buku yang saya baca.' aux
        $ nluc fi 'Voiko täältä lainata aurinkovarjoa?' default True

        :param source: language code of the source text
        :param sents: sentence text; when empty, the clipboard content is used
        :param specified: chunk spec name, 'default' for none
        :param do_test: when True, run check_rule over the inferred patterns
        :return:
        """
        from sagas.nlu.uni_remote import dep_parse
        from sagas.nlu.common import get_from_clip
        from sagas.conf.conf import cf
        from sagas.nlu.uni_remote_viz import list_chunks
        from sagas.nlu.utils import fix_sents

        if sents=='':
            sents = get_from_clip()
            if sents.strip()=='':
                tc.info('no text available in clipboard.')
                return
        sents=fix_sents(sents, source)
        tc.info(sents)

        # Parse the sentence and display its chunks, domains and contrast translations.
        engine=cf.engine(source)
        doc_jsonify, resp = dep_parse(sents, source, engine, ['predicts'])
        if doc_jsonify is None:
            raise Exception(f'Cannot parse sentence for lang {source}')

        list_chunks(doc_jsonify, resp, source,
                    enable_contrast=True,
                    specified=None if specified=='default' else specified)
        words = [word.text for word in doc_jsonify.words]
        self.contrast(sents, source, word_map=words)

        ## visual tree
        self.main_domains(sents, source, engine, False)
        ## add rulesets procs
        from sagas.nlu.inferencer import do_infers
        cli_cmd, pats = do_infers(sents, source)
        if do_test:
            for pat in pats:
                self.check_rule(sents, source, pat)
Code example #4
File: uni_remote_viz.py Project: CoderOverflow/stack
def list_rs(rs, lang):
    # from IPython.display import display
    from termcolor import colored
    tc.emp('cyan', f"✁ chunks. {'-' * 25}")
    for serial, r in enumerate(rs):
        df = sagas.to_df(
            r['domains'],
            ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        if 'head' in r:
            cla = "%s/%s(%s)" % (r['head_word'], r['head'], r['head_pos'])
        else:
            cla = '_'
        tc.info(serial_numbers[serial], '%s(%s)' % (r['type'], r['lemma']),
                cla)
        # sagas.print_df(df)
        tc.dfs(trunc_cols(df))
        print_stem_chunks(r)
        list_synsets(r, lang)
Code example #5
def universal_viz(intp, sents):
    from sagas.nlu.uni_parser import get_chunks
    from sagas.tool.misc import print_stem_chunks
    import sagas

    doc = intp(sents)
    doc.build_dependencies()
    # print(doc.dependencies)
    rs = get_chunks(doc)
    # print(rs)
    for r in rs:
        df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        tc.info('%s(%s)' % (r['type'], r['lemma']))
        tc.dfs(df)
        # display(df)
        print_stem_chunks(r)

    cv = EnhancedViz(shape='egg', size='8,5', fontsize=20)
    return cv.analyse_doc(doc, None)
Code example #6
File: aiobj_base.py Project: CoderOverflow/stack
        def _(self, lang, text, *sents):
            tc.info(type(self).__name__, isinstance(self, Keeper), text, lang)
            # data = {'lang': lang, "sents": text, 'engine': 'corenlp', 'disable_predicts': False}
            # domains, meta = self.request_domains(data)
            # engine = 'ltp' if lang == 'zh' else 'corenlp'
            engine = cf.engine(lang)
            domain_set = get_domains(text, lang, engine)
            for domains, meta in domain_set:
                # print(f"{meta['lemma']} ({meta['phonetic']}, {meta['word']})")
                # print(f"{meta['lemma']}")
                # execute rulesets
                tc.info('rules', [r.name for r in self.rulesets])
                for i, ruleset in enumerate(self.rulesets):
                    # print(colored(f"✁ {i}. {'-' * 25}", 'cyan'))
                    tc.emp('cyan', f"✁ {i}. {'-' * 25}")
                    rule_rs = ruleset(domains, meta, self, sents)
                    display_result_df(rule_rs)

            if isinstance(self, Keeper):
                return self.callback(text)
            return None
Code example #7
def dep_parse(sents:Text, lang:Text='en', engine='stanza',
              pipelines:List[Text]=None,
              doc_impl=None)-> (SentenceIntf, Dict):
    if pipelines is None:
        pipelines = []
    if doc_impl is None:
        doc_impl=cf.extensions('doc.wrapper', engine)
    data = {'lang': lang, "sents": sents, 'engine': engine, 'pipelines':pipelines}
    logger.debug(f".. request is {data}")
    # tc.info(data['sents'])
    response = requests.post(f'{cf.servant(engine)}/dep_parse', json=data)
    if response.status_code != 200:
        tc.info('.. dep_parse servant invoke fail.')
        return None, None

    result = response.json()
    words=result['sents']
    if len(words) == 0:
        tc.info('.. dep_parse servant returns empty set.')
        tc.info('.. request data is', data)
        return None, None

    # print('.......')
    doc_jsonify = doc_impl(words, text=sents)
    if len(pipelines)>0:
        result_set={p:result[p] for p in pipelines}
        return doc_jsonify, result_set
    return doc_jsonify, None
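
A short calling sketch for dep_parse, mirroring how get_domains and clip_parse above invoke it; both returned values can be None, so callers should guard before use:

from sagas.nlu.uni_remote import dep_parse

doc_jsonify, resp = dep_parse('列出上周编辑的文件。', 'zh', 'ltp', ['predicts'])
if doc_jsonify is None:
    print('.. parse failed or the servant returned an empty set.')
else:
    print(doc_jsonify.words_string())
    if resp is not None and 'predicts' in resp:
        print(resp['predicts'])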
Code example #8
    def __call__(self, domains, meta, ctx=None, param_sents=None):
        rule_rs = self.rules(domains, meta)
        # .. parts {'sbv': '你', 'vob': '电脑', 'wp': '?'}
        tc.info('.. parts', {k: v for k, v in rule_rs[0][3].lemmas.items()})
        if all([val[1] for val in rule_rs]):
            results = [el for r in rule_rs for el in r[3].results]
            # .. results
            # ('ins_rasa', 'vob', {'intent': 'how_many', 'confidence': 0.9721028208732605})
            if len(results) > 0:
                tc.info('.. results')
                tc.info([f"{r[0]}/{r[1]}" for r in results])
                # color_print('blue', json.dumps(results, indent=2, ensure_ascii=False))
                tc.emp('blue', results)

            # If parameters (kwargs) are present, use their rulesets to check param_sents,
            # store the resulting inspector results under the corresponding parameter names,
            # and pass them, together with the rules' results, to the executor.
            if len(self.parameters) > 0:
                tc.emp('red',
                       'parameters -> %s' % ', '.join(self.parameters.keys()))
                if param_sents is not None:
                    tc.emp('yellow', '; '.join(param_sents))

            # .. matched: how_many_artifact
            if ctx is not None:
                self.executor(ctx)
            else:
                self.executor(self.name)
        return rule_rs
Code example #9
    def print_dependencies(self, doc, segs, node_maps, verbose=False):
        for dep_edge in doc.dependencies:
            if verbose:
                tc.info((dep_edge[2].text, dep_edge[0].index, dep_edge[1]))
            # head = int(dep_edge[0].index)
            # governor-id is index in words list + 1
            rel = dep_edge[1]

            if rel.endswith('comp'):
                self.f.attr('edge', style='dashed')
            else:
                self.f.attr('edge', style='solid')

            head = int(dep_edge[0].index) - 1
            node_text = node_maps[dep_edge[2].text]

            if head==-1:
                # print("%s's head is root %s"%(node_text, segs[head]))
                head_node='ROOT'
            else:
                head_node=segs[head]

            self.f.edge(head_node, node_text, label=self.fix_console_label(rel),
                        fontsize='11', fontname='Calibri')
Code example #10
    def run(self, text, langs=None, top_result=5, summary=False, verbose=True):
        """
        $ python -m sagas.corpus.searcher run 'I read a letter.'
        $ python -m sagas.corpus.searcher run 'I read a letter.' ja,id
        $ python -m sagas.corpus.searcher run 'I read a letter.' ja,id,fa 2 True False
        :param text: query sentence (English, since the corpus is aligned against English)
        :param langs: languages for the contrast lookup, e.g. ja,id
        :param top_result: number of similar quotes to show
        :param summary: when True, print sentence-type summaries
        :param verbose: when True, dump the cross-language search results as JSON
        :return:
        """
        # First retrieve the English sentences most similar to the given text.
        relevant_quotes, relevant_chapters = self.search(text, ['text', 'chapter'], top_result)
        summary_info=[]
        for q in range(top_result):
            tc.emp('magenta', '>' + relevant_quotes[q])
            tc.emp('green', relevant_chapters[q])

            if langs is not None:
                # The corpus is aligned against English, so the English sentence is
                # enough to look up the corresponding sentences in other languages.
                # search_in_list('I write a letter.', ['ja', 'fa', 'id'])
                results=search_in_list(relevant_quotes[q], langs)
                if verbose:
                    tc.emp('blue', json.dumps(results, indent=2, ensure_ascii=False))

                if summary:
                    all_types = []
                    rs_c=CorpusSearcher.parse_controls(results)
                    for r in rs_c:
                        if r[2]!='':
                            tc.emp('red', f".. {r[2]}")
                        types=sents_summary(r[0], r[1])
                        all_types.extend(types)
                    summary_info.append((relevant_quotes[q], all_types))

            tc.emp('cyan', '✁', '-' * 30)

        for s in summary_info:
            tc.info(s)
Code example #11
    def request_domains(self,
                        data: Dict[Text, Any],
                        print_format='table',
                        engine=None):
        import requests
        import json
        from sagas.conf.conf import cf
        from sagas.nlu.rules_meta import build_meta

        if engine is None:
            engine = cf.engine(data['lang'])
        data['engine'] = engine
        tc.info(f".. request is {data}")

        response = requests.post(f'{cf.servant(engine)}/verb_domains',
                                 json=data)
        rs = response.json()
        if len(rs) == 0:
            tc.info('.. verb_domains servant returns empty set.')
            tc.info('.. request data is', data)
            return None, None

        r = rs[0]
        # if print_format=='table':
        #     self.print_table(rs)
        # elif print_format=='jupyter':
        #     self.print_table(rs, False)
        if print_format != 'json':
            self.print_table(rs)
        else:
            tc.info(json.dumps(r, indent=2, ensure_ascii=False))

        domains = r['domains']
        # common = {'lemma': r['lemma'], 'word': r['word'],
        #           'stems': r['stems']}
        # meta = {'rel': r['rel'], **common, **data}

        meta = build_meta(r, data)
        return domains, meta
Code example #12
File: rules_obj_spec.py Project: CoderOverflow/stack
 def __init__(cls, clsname, superclasses, attributedict):
     # ruleset_stats =
     cls.rulesets = \
         [RuleSet('how_many_artifact_c',
                  rules=lambda d, m: [
                      # $ sz '你有几台笔记本电脑?'
                      Patterns(d, m, 5).verb(behaveof('have', 'v'), __engine='ltp',
                                             vob=intentof('how_many', 0.75)),
                      *actions_vob(d, m, [('have', 'device/artifact'), ]),
                  ],
                  executor=lambda obj: tc.info('red', f'.. object: {obj}'),
                  files=lambda d, m: [
                      # $ sz '有多少文件'
                      Patterns(d, m, 5).verb(behaveof('have', 'v'), __engine='ltp',
                                             a1=kindof('file/communication', 'n')),
                  ],
                  checkers=lambda d, m: [
                      *actions_vob(d, m, [('have', 'device/artifact'), ]),
                  ],
                  ),
          ]
     BaseMeta.setup(cls)
Code example #13
File: nlu_tools.py Project: CoderOverflow/stack
    def contrast(self, text, source, target='en', word_map=None):
        """
        $ nlu contrast '저는 허락을 못 받아서 안 왔어요.' ko
        :param text: text to translate and contrast
        :param source: source language code
        :param target: target language code, defaults to 'en'
        :param word_map: optional list of words used for the word-level mapping
        :return:
        """
        from sagas.nlu.translator import get_word_map
        from sagas.nlu.translator import translate
        from sagas.tool.misc import color_print

        r, tracker = translate(text, source=source, target=target, options={'get_pronounce'})
        tc.info(r)
        for i, p in enumerate(tracker.pronounce):
            ps = p[2:]
            tc.info(f'v{i}="{ps}"')
        rs, trans_table=get_word_map(source, target, text,
                                     words=word_map,
                                     local_translit=True if source in contrast_translit_langs else False)
        for i, (k, r) in enumerate(rs.items()):
            tc.info(f"{i} - ", r.replace('\n', ' '))

        color_print('cyan', ' '.join(trans_table))
Code example #14
File: rules_obj_spec.py Project: CoderOverflow/stack
 def callback(self, t):
     tc.info(f'** {self.name} callback', t)
     return self
Code example #15
    def analyse_doc(self, sentence, node_maps=None, console=True):
        from sagas.nlu.uni_intf import sub_comps
        import unicodedata

        segs = []
        # omit {word.feats}
        if console:
            tc.info(*[f'index: {word.index}\ttext: {word.text+" "}\tlemma: {word.lemma}\tupos: {word.upos}\txpos: {word.xpos}' for word in sentence.words], sep='\n')
        else:
            # from IPython.display import display
            import sagas
            df=sagas.to_df([(word.index, word.text, word.lemma, word.upos, word.xpos, word.entity) for word in sentence.words],
                           ['index', 'text', 'lemma', 'upos', 'xpos', 'entity'])
            tc.dfs(df)

        def translit_chunk(chunk:str, lang):
            from sagas.nlu.transliterations import translits
            # if upos=='PUNCT':
            #     return chunk
            if chunk.strip() in (',','.',';','?','!'):
                return chunk
            # if lang in ('ko', 'ja', 'fa', 'hi', 'ar'):
            if translits.is_available_lang(lang):
                if sa_env.runtime!='default':
                    return chunk+'\n'+translits.translit(chunk, lang)
                return translits.translit(chunk, lang)
            return chunk

        if node_maps is None:
            node_maps = {}
            for word in sentence.words:
                pos_attrs=f"({word.upos.lower()}, {word.xpos.lower()})"
                node_text=word.text if self.translit_lang is None or word.upos=='PUNCT' \
                    else translit_chunk(word.text, self.translit_lang)
                # node_text=unicodedata.normalize('NFKC', node_text) if word.upos=='PUNCT' else node_text
                norm=lambda t: unicodedata.normalize('NFKC', t).encode('ascii', 'ignore').decode("utf-8")
                node_text = norm(node_text) if word.upos == 'PUNCT' else node_text
                if node_text=='':
                    node_text='_'
                # verbose
                if word.text!=node_text:
                    print('# ', f"{word.text} -> {node_text}")
                node_maps[word.text] = node_text if not self.enable_node_pos else f"{node_text}\\n{pos_attrs}"

                # self.f.attr(color='black')
        prop_sets = {'VERB': lambda f: f.attr('node', style='filled', color='lightgrey'),
                     'PRON': lambda f: f.attr('node', style='dashed', color='red'),
                     'AUX': lambda f: f.attr('node', style='dashed', color='green'),
                     'NOUN': lambda f: f.attr('node', style='solid', color='blue'),
                     }
        # sentence = doc.sentences[0]
        for word in sentence.words:
            rel = word.dependency_relation
            if rel in sub_comps:
                if word.upos == 'VERB':
                    self.f.attr('node', style='filled', color='antiquewhite')
                elif word.upos in prop_sets:
                    prop_sets[word.upos](self.f)
                else:
                    self.default_node()

            # for all languages
            elif rel.endswith('comp'):
                self.f.attr('node', style='filled', color='antiquewhite')
            elif word.upos in prop_sets:
                prop_sets[word.upos](self.f)
            else:
                self.default_node()

            head = ''
            if word.governor == 0:
                head = '_root_'
            else:
                head_word = sentence.words[word.governor - 1]
                head = head_word.text
            # print(f"{word.text} -> {rel}, {word.governor}, {head}")
            self.f.node(node_maps[word.text])
            segs.append(node_maps[word.text])

        # self.f.node_attr.update(color='black')
        self.default_node()
        self.print_dependencies(sentence, segs, node_maps)
        return self.f
Code example #16
File: misc.py Project: CoderOverflow/stack
def rs_represent(rs: List[Any], data: Dict[Text, Any], return_df=False):
    import sagas
    from sagas.nlu.rules import verb_patterns, aux_patterns, subj_patterns, predict_patterns
    from sagas.nlu.rules_lang_spec import langspecs
    from sagas.nlu.nlu_cli import NluCli
    from sagas.nlu.sinkers import Sinkers

    df_set = []
    result = []
    sinkers = Sinkers(data, rs[0]['type'])
    for serial, r in enumerate(rs):
        type_name = r['type']
        meta = build_meta(r, data)
        if type_name == 'verb_domains':
            theme = '[verb]'
            tc.info(
                serial_numbers[serial],
                theme,
                # r['lemma'], r['index'],
                f"{r['word']}/{r['lemma']}, pos: {r['upos']}/{r['xpos']}, idx: {r['index']}",
                '(%s, %s)' % (r['rel'], r['governor']))
            # meta = {'rel': r['rel'], **common, **data}
            verb_patterns(meta, r['domains'])
        elif type_name == 'aux_domains':
            theme = '[aux]'
            # 'rel': word.dependency_relation, 'governor': word.governor, 'head': dc.text
            delegator = '☇' if not r['delegator'] else '☌'
            tc.info(serial_numbers[serial], theme, r['lemma'], r['rel'],
                    delegator, "%s(%s)" % (r['head'], r['head_pos']))
            # verb_patterns(r['domains'])
            # meta = {'pos': r['head_pos'], 'head': r['head'], **common, **data}
            aux_patterns(meta, r['domains'])
        elif type_name == 'subj_domains':
            theme = '[subj]'
            tc.info(serial_numbers[serial], theme, r['lemma'], r['rel'], '☇',
                    f"{r['head']}")
            # verb_patterns(r['domains'])
            # meta = {'pos': r['head_pos'], 'head': r['head'], **common, **data}
            subj_patterns(meta, r['domains'])
        elif type_name == 'predicate':
            theme = '[predicates]'
            tc.info(serial_numbers[serial], theme,
                    f"{r['lemma']} ({r['phonetic']}, {r['word']})")
            # meta = {'rel': r['rel'], **common, **data}
            predict_patterns(meta, r['domains'])
        elif type_name == 'root_domains':
            theme = '[root]'
            tc.info(
                serial_numbers[serial], theme,
                f"{r['word']}/{r['lemma']}, pos: {r['upos']}/{r['xpos']}, idx: {r['index']}",
                '(%s, %s)' % (r['rel'], r['governor']))
            # meta = {'rel': r['rel'], **common, **data}
            # verb_patterns(meta, r['domains'])
            # check_langspec(data['lang'], meta, r['domains'], type_name)
        else:
            # meta = {}
            raise Exception(
                'Cannot process specific type: {}'.format(type_name))

        # process language special rules
        logger.debug(f"meta keys {meta.keys()}")
        mod_rs = langspecs.check_langspec(data['lang'], meta, r['domains'],
                                          type_name)
        sinkers.add_module_results(mod_rs)

        # df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'children', 'features'])
        df = sagas.to_df(
            r['domains'],
            ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        df_set.append(df)

        if not return_df:

            result.extend(
                proc_word(type_name, r['word'],
                          r['head_word'] if 'head_word' in r else '',
                          data['lang']))
            result.extend(
                proc_children_column(df['rel'], df['children'], data['lang']))

            # print('.......')
            # where 1 is the axis number (0 for rows and 1 for columns.)
            # df = df.drop('children', 1)
            # df['children'] = df['children'].apply(lambda x: ', '.join(x)[:15] + "..")
            # df['features'] = df['features'].apply(lambda x: ', '.join(x)[:15] + "..")
            trunc_cols(df)
            tc.dfs(df)
            print_stem_chunks(r)

            if print_def:
                NluCli().get_word_def(r['lemma'], data['lang'])
            if print_synsets:
                r = display_synsets(theme, meta, r, data['lang'])
                result.extend(r)

    sinkers.process_with_sinkers()
    return result, df_set
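
A hedged end-to-end sketch that feeds chunks from dep_parse and get_chunks into rs_represent, following the commented-out call inside get_domains above; the sagas.tool.misc path for rs_represent is an assumption based on the other imports of misc.py:

from sagas.nlu.uni_remote import dep_parse
from sagas.nlu.uni_parser import get_chunks
from sagas.tool.misc import rs_represent  # assumed module path for misc.py

sents, lang, engine = 'I read a letter.', 'en', 'corenlp'
doc_jsonify, _ = dep_parse(sents, lang, engine, [])
if doc_jsonify is not None:
    rs = get_chunks(doc_jsonify)
    if rs:  # rs_represent expects at least one chunk
        data = {'lang': lang, 'sents': sents, 'engine': engine, 'pipelines': []}
        result, df_set = rs_represent(rs, data)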