Example #1
from typing import Text
import pandas as pd

def get_corpus(lang: Text, chapter: Text):
    from sagas.nlu.utils import fix_sents

    # Load the crawled corpus for the language and keep only the matching chapter.
    dfjson = pd.read_json(f'~/pi/stack/crawlers/langcrs/all_{lang}.json')
    ch = dfjson[dfjson['chapter'].str.match(chapter)]
    # Normalize every translated sentence before returning it.
    return [fix_sents(sent, lang) for sent in ch['translate']]
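
A minimal usage sketch, assuming the corpus JSON exists at the path above; the 'ko' / 'At school' arguments are borrowed from the proc_corpus doctest in Example #6:

sents = get_corpus('ko', 'At school')
print(len(sents), 'normalized sentences')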
Example #2
def build_anal_tree(sents: Text,
                    lang: Text,
                    engine: Text,
                    nodecls=None,
                    docimpl=None):
    """
    >>> from sagas.nlu.anal import build_anal_tree
    >>> from anytree.search import findall, findall_by_attr
    >>> f=build_anal_tree(sents, lang, engine)
    >>> words = findall_by_attr(f, name='upos', value='VERB')
    >>> objs = findall(words[0], filter_=lambda n: n.dependency_relation in ("obj",))

    :param sents: sentence text to analyze
    :param lang: language code
    :param engine: parser engine, e.g. 'stanza'
    :return: root node of the analysis tree
    """
    from sagas.nlu.uni_remote import dep_parse
    from sagas.nlu.utils import fix_sents

    sents = fix_sents(sents, lang)
    # doc is SentenceIntf
    doc, resp = dep_parse(sents,
                          lang=lang,
                          engine=engine,
                          pipelines=['predicts'],
                          doc_impl=docimpl)
    predicts = resp['predicts'] if resp and 'predicts' in resp else []
    return from_doc(doc, lang, engine, nodecls, predicts)
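
A hypothetical invocation to make the doctest concrete; the Dutch sentence comes from Example #8 and the 'stanza' engine from Example #6:

from anytree.search import findall_by_attr

f = build_anal_tree('De vrouw heeft verschillende appels.', 'nl', 'stanza')
verbs = findall_by_attr(f, name='upos', value='VERB')  # all VERB nodes in the tree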
Example #3
def get_records(lang, chapter, field, fix=False):
    from sagas.tool.intents_tool import intents_tool
    from sagas.nlu.utils import fix_sents

    text_list = []
    for doc in intents_tool.db.corpus.find({'chapter': chapter}):
        sent = fix_sents(doc[field], lang) if fix else doc[field]
        # English entries have no separate reference text, so derive one
        # from the chapter name and index.
        ref = (doc['text'] if lang != 'en'
               else f"{doc['chapter'].lower()[:10]}_{doc['index']}")
        text_list.append((sent, ref, doc.get('intent', '')))
    return text_list
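
A hedged usage sketch; the 'lang_fr' field name follows the field = f'lang_{lang}' convention in Example #7, and 'People' is a chapter used there:

records = get_records('fr', 'People', 'lang_fr', fix=True)
for sent, ref, intent in records:
    print(ref, '->', sent, intent)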
Example #4
def parse_comps(sents, source):
    from sagas.nlu.uni_remote import dep_parse
    from sagas.nlu.uni_parser import get_chunks
    from sagas.nlu.utils import fix_sents
    from sagas.conf.conf import cf

    sents = fix_sents(sents, source)
    engine = cf.engine(source)
    doc_jsonify, resp = dep_parse(sents, source, engine, ['predicts'])
    # Prefer the server-side predicts; otherwise fall back to local chunking.
    if resp and len(resp['predicts']) > 0:
        rs = resp['predicts']
    else:
        rs = get_chunks(doc_jsonify)
    return rs
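
A sketch of calling parse_comps; the sample sentence is reused from Example #8, and each result row carries the 'type' and 'word' keys consumed in Example #9:

rs = parse_comps('De vrouw heeft verschillende appels.', 'nl')
for r in rs:
    print(r['type'], '->', r['word'])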
Example #5
def parse_deps(text, lang, translit=None):
    # display_doc_deps and tools are provided by the surrounding module.
    import streamlit as st
    from sagas.nlu.uni_remote import dep_parse
    from sagas.nlu.uni_remote_viz import list_chunks
    from sagas.nlu.utils import fix_sents
    from sagas.conf.conf import cf

    text = fix_sents(text, lang)
    engine = cf.engine(lang)
    doc_jsonify, resp = dep_parse(text, lang, engine, ['predicts'])
    if doc_jsonify is not None:
        # Render the chunks, the dependency graph, and contrast translations.
        list_chunks(doc_jsonify, resp, lang, enable_contrast=True)
        g = display_doc_deps(doc_jsonify, resp, translit_lang=lang)
        st.graphviz_chart(g)
        if translit is not None:
            st.text(f"♤ {translit}")

        words = [word.text for word in doc_jsonify.words]
        tools.contrast(text, lang, word_map=words)
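
Since parse_deps renders through Streamlit (st.graphviz_chart, st.text), it is meant to be called from a script launched with streamlit run; a minimal sketch using the Finnish sentence from Example #8:

# app.py, run with: streamlit run app.py
parse_deps('Voiko täältä lainata aurinkovarjoa?', 'fi')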
Example #6
def proc_corpus(lang, chapter):
    """
    >>> proc_corpus('ko', 'At school')

    :param lang: language code of the corpus file
    :param chapter: chapter name to match
    :return: (total sentences in chapter, count of sentences with a verb digest)
    """
    import pandas as pd
    from sagas.nlu.utils import fix_sents
    # digest_verb is provided by the surrounding module.

    dfjson = pd.read_json(f'~/pi/stack/crawlers/langcrs/all_{lang}.json')
    ch = dfjson[dfjson['chapter'].str.match(chapter)]
    rs_map = {}
    for i, (sent, ref) in enumerate(zip(ch['translate'], ch['text'])):
        text = fix_sents(sent, lang)
        print('-', i, ref, '-' * 10)
        rs_map[i] = digest_verb(text, lang, 'stanza')
    return len(ch), len([t for t in rs_map.values() if t])
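
The returned pair lets a caller report digest coverage for the chapter; a small sketch:

total, parsed = proc_corpus('ko', 'At school')
print(f'{parsed}/{total} sentences produced a verb digest')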
Example #7
    def list_chapter_text(self, lang, chapter, fix=False):
        """
        >>> intents_tool.list_chapter_text('fr', 'People', True)
        >>> intents_tool.list_chapter_text('en', 'People')

        :param lang: language code; 'en' reads the base 'text' field
        :param chapter: chapter name to match
        :param fix: normalize each sentence with fix_sents
        :return:
        """
        from sagas.nlu.utils import fix_sents

        # English text lives in 'text'; other languages in 'lang_<code>'.
        field = f'lang_{lang}' if lang != 'en' else 'text'
        lines = [
            fix_sents(doc[field], lang) if fix else doc[field]
            for doc in self.db.corpus.find({'chapter': chapter})
        ]
        print(*lines, sep='\n')
Example #8
    def clip_parse(self, source, sents='', specified='default', do_test=False):
        """
        >> clip text: ‫یک آبجو مى خواهم.‬
        $ nlu clip_parse fa
        $ engine='stanford' nluc ar
        $ nlu clip_parse fi 'Tuolla ylhäällä asuu vanha nainen.'
        $ nluc nl 'De vrouw heeft verschillende appels.'
        $ nluc id 'Ini adalah judul buku yang saya baca.' aux
        $ nluc fi 'Voiko täältä lainata aurinkovarjoa?' default True

        :param source:
        :return:
        """
        from sagas.nlu.uni_remote import dep_parse
        from sagas.nlu.common import get_from_clip
        from sagas.conf.conf import cf
        from sagas.nlu.uni_remote_viz import list_chunks
        from sagas.nlu.utils import fix_sents

        if sents == '':
            sents = get_from_clip()
            if sents.strip() == '':
                tc.info('no text available in clipboard.')
                return
        sents = fix_sents(sents, source)
        tc.info(sents)

        # Parse the sentence and display its chunks, domains, and contrast translations.
        engine = cf.engine(source)
        doc_jsonify, resp = dep_parse(sents, source, engine, ['predicts'])
        if doc_jsonify is None:
            raise Exception(f'Cannot parse sentence for lang {source}')

        list_chunks(doc_jsonify, resp, source,
                    enable_contrast=True,
                    specified=None if specified == 'default' else specified)
        words = [word.text for word in doc_jsonify.words]
        self.contrast(sents, source, word_map=words)

        ## visual tree
        self.main_domains(sents, source, engine, False)
        ## add rulesets procs
        from sagas.nlu.inferencer import do_infers
        cli_cmd, pats = do_infers(sents, source)
        if do_test:
            for pat in pats:
                self.check_rule(sents, source, pat)
Example #9
def sents_summary(sents, source):
    from sagas.nlu.uni_remote import dep_parse
    from sagas.nlu.uni_remote_viz import list_contrast
    from sagas.conf.conf import cf
    from sagas.nlu.utils import fix_sents
    from sagas.nlu.uni_parser import get_chunks

    sents = fix_sents(sents, source)
    engine = cf.engine(source)
    doc_jsonify, resp = dep_parse(sents, source, engine, ['predicts'])
    types = []
    if doc_jsonify is None:
        raise Exception(f'Cannot parse sentence for lang {source}')
    # Prefer the server-side predicts; otherwise fall back to local chunking.
    if len(resp['predicts']) > 0:
        rs = resp['predicts']
    else:
        rs = get_chunks(doc_jsonify)

    for serial, r in enumerate(rs):
        print(f"{serial}. {r['type']} -> {r['word']}")
        types.append(f"{source}:{r['type']}")
    list_contrast(rs, source)
    return types
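
A usage sketch; sents_summary prints each chunk and returns the accumulated 'lang:type' tags, here with the Finnish sample from Example #8:

types = sents_summary('Tuolla ylhäällä asuu vanha nainen.', 'fi')
print(types)  # tags of the form 'fi:<chunk type>'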
Example #10
from typing import Dict

def parse_sents(data: Dict):
    from sagas.nlu.utils import fix_sents

    # parse_and_cache is provided by the surrounding module.
    sents, source, engine = data['sents'], data['lang'], data['engine']
    sents = fix_sents(sents, source)
    return parse_and_cache(sents, source, engine)
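
parse_sents expects a request-style dict with 'sents', 'lang', and 'engine' keys; a sketch, with the 'stanza' engine borrowed from Example #6:

result = parse_sents({'sents': 'Tuolla ylhäällä asuu vanha nainen.',
                      'lang': 'fi',
                      'engine': 'stanza'})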
Example #11
def fix_data(data):
    # Fill in the default engine for the language when none was given.
    if 'engine' not in data:
        data['engine'] = cf.engine(data['lang'])
    data['sents'] = fix_sents(data['sents'], data['lang'])
    return data
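
fix_data complements parse_sents in Example #10 by filling in the default engine before parsing; a sketch:

data = fix_data({'sents': 'De vrouw heeft verschillende appels.', 'lang': 'nl'})
# data now also carries an 'engine' key chosen by cf.engine('nl')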
Example #12
    def trans_clip(self,
                   source='auto',
                   targets='zh-CN;ja',
                   says=None,
                   details=True,
                   sents='',
                   deps=''):
        """
        $ trans
        $ trans auto en
        $ trans ru en
        $ trans ru 'zh-CN;ja'
        $ trans-ru
        $ trans-rus

        $ alias sp="python -m sagas.tool.misc trans_clip pt 'en;it;ja' ja False"
        $ sp 'O homem fica amarelo.'
        $ sa 'أنا متأسف.'
        $ sf "La similitude entre ces deux phrases" 'ja;zh;id'
        $ sz '这两句话的相似程度' en
        $ sz '这两句话的相似程度' 'en;fr;ar;ja;fa'
        $ engine=spacy se 'I like to eat cucumber.'
        $ sj '足にひどい痛みを感じました。'  # multiple predicates

        :return:
        """
        import clipboard
        from sagas.nlu.nlu_cli import NluCli
        from sagas.nlu.nlu_tools import NluTools

        ascii_incompatibles = ['zh', 'ja', 'ko', 'ar', 'fa']

        if sents != '':
            text = sents
            interact_mode = False
        else:
            text = clipboard.paste()
            text = text.replace("\n", "")
            interact_mode = True

        # fix_sents handles language-specific cleanup, e.g. removing the
        # spaces in ja/zh clipboard text.
        text = fix_sents(text, source)
        engine = cf.engine(source)
        tc.emp('yellow', f".. parse with {engine}: ({text})")

        # events
        from sagas.nlu.events import init_reps
        init_reps()

        # add at 2019.9.15
        ascii_gs = []
        if self.enable_ascii_viz:
            rt = NluCli().ascii_viz(text, source, engine=engine)
            if source not in ascii_incompatibles:
                ascii_gs.extend(rt.split('\n'))
            print(rt)
        ctx = TransContext(source, targets, text, says, deps)
        if source != 'auto':
            ctx.sents_map[source[:2]] = text

        # Dispatch to the configured translator backend.
        succ = self.translators[self.translator](ctx)

        if not succ:
            return

        ## addons
        if self.enable_chunks_parse:
            addons, result = self.parse_chunks(text,
                                               source,
                                               targets,
                                               ctx,
                                               details=details)
        else:
            addons = []
            # Format the source sentence plus its translations as a .sent(...) snippet.
            lines = [f'\t.sent({source}="{text}"']
            suffix = ") \\"
            result = ', \n\t      '.join(lines + ctx.target_sents + [suffix])
            print(result)

        # other langs dep-parse
        if self.enable_ascii_viz and deps != '':
            for t in deps.split(';'):
                if t in ctx.sents_map:
                    rt = NluCli().ascii_viz(ctx.sents_map[t],
                                            t,
                                            engine=cf.engine(t))
                    print(rt)
                else:
                    color_print(
                        'red',
                        f".. the lang {t} for dep-parse is not available in the translated list."
                    )

        # do infers
        from sagas.nlu.inferencer import do_infers
        cli_cmd, pats = do_infers(text, source)

        tools = NluTools()

        # copy to clipboard
        if interact_mode:
            result = result + '\n\t'
            if self.append_ascii_viz and len(ascii_gs) > 0:
                result = result + '\n\t'.join(ascii_gs)

            if len(addons) > 0:
                # result=result+'\n\t'+'\n\t'.join(addons)
                result = result + '\n\t'.join(addons)
            if pats:
                result = '\n\t'.join([result, cli_cmd, '\n\t'.join(pats)])
            if self.enable_chunks_parse:
                result = result + '\n'
            clipboard.copy(result)

        if interact_mode and says is not None:
            tools.say(ctx.sents_map[says], says)