def print_r(r):
    import sagas
    from sagas.tool.misc import print_stem_chunks
    df = sagas.to_df(r['domains'],
                     ['rel', 'index', 'text', 'lemma', 'children', 'features'])
    print('%s(%s)' % (r['type'], r['lemma']))
    sagas.print_df(df)
    print_stem_chunks(r)

def deconstructing(self, text, target='ar'):
    """
    $ python -m sagas.ar.arabic_processor deconstructing 'I am a student'
    $ python -m sagas.ar.arabic_processor deconstructing 'I am a student' de
    $ python -m sagas.ar.arabic_processor deconstructing 'I am a student' fr
    $ python -m sagas.ar.arabic_processor deconstructing 'I am a student' es
    $ python -m sagas.ar.arabic_processor deconstructing 'I am a student' vi
    ## other langs: ru, ja, zh

    :param text:
    :param target:
    :return:
    """
    import sagas
    source = 'en'
    options = {'get_pronounce', 'get_translations'}
    res, t = translate(text, source=source, target=target,
                       trans_verbose=False, options=options)
    print('✁', '%s(%s %s)' % (text, res, ''.join(t.pronounce)))
    for sent in text.split(' '):
        res, t = translate(sent, source=source, target=target,
                           trans_verbose=False, options=options)
        # print('%s(%s%s)' % (sent, res, marks_th(t.pronounce)), end=" ")
        print('%s(%s%s)' % (sent, res, marks_th(t.pronounce)))
        sagas.print_df(t.translations)
    print('.')

def parse_print(self, sents, format='default'):
    """
    $ python -m sagas.nlu.stanford_helper parse_print 'جارك رجل طيب'

    :param sents:
    :param format:
    :return:
    """
    import json
    import sagas

    # TEXT = 'جارك رجل طيب'
    # host = 'pc'
    ann = self.invoke_server(sents, 'arabic',
                             output_format='text' if format == 'default' else 'json')
    if format == 'default':
        print(ann.strip())
    elif format == 'json':
        print(', '.join(ann['sentences'][0].keys()))
        print(json.dumps(ann, indent=2, ensure_ascii=False))
    elif format == 'df':
        tokens = ann['sentences'][0]['tokens']
        sagas.print_df(sagas.dict_df(tokens))

def analyse_ar(text, disp_df=False):
    import sagas
    target = 'en'
    if disp_df:
        sagas.print_df(process_df('ar', target, text, with_styles=False))
    else:
        process('ar', target, text)
    NluTools().say(text, 'ar')

def display(df, col_defs=None):
    if outputer == 'console':
        if col_defs is not None:
            for col in col_defs:
                crop_column(df, col[0], col[1])
        sagas.print_df(df)
    else:
        from IPython.display import display
        display(df)

def all_sources(self, s):
    """
    $ python -m sagas.nlu.trans_cacher all_sources vi

    :param s:
    :return:
    """
    import sagas
    rs = []
    for r in cacher.coll.find({'source': s}):
        rs.append((r['text'], r['target']))
    sagas.print_df(sagas.to_df(rs, ['text', 'target']))

def testings(self):
    """
    $ python -m sagas.nlu.corenlp_procs testings

    :return:
    """
    ds = [
        words_table('عمري تسعة عشر عاماً.', 'ar'),
        words_table('آخرین کسی که به کامپیوتر وصل شد، کی بود؟', 'fa')
    ]
    for df in ds:
        sagas.print_df(df)

def simple():
    from pymongo import MongoClient
    import sagas

    # uri = 'mongodb://samlet.com/langs'
    uri = 'mongodb://localhost/langs'
    # uri = 'mongodb://192.168.0.101/langs'
    client = MongoClient(uri)
    db = client.get_default_database()
    print(db.name)

    rs = []
    for r in db.trans.find({'source': 'id'}):
        rs.append((r['text'], r['target']))
    sagas.print_df(sagas.to_df(rs, ['text', 'target']))

def tests(self):
    """
    $ python -m sagas.nlu.translit_ar tests

    :return:
    """
    import pandas as pd
    import sagas

    inputfile1 = 'data/phrases.csv'
    inputfile2 = 'data/nouns.csv'
    inputdf = pd.read_csv(inputfile1)
    outputdf = self.transliterate_df(inputdf)
    d = pd.concat([inputdf, outputdf], axis=1)
    sagas.print_df(d)

def ents(self, sents, lang='en', simple=True):
    """
    $ python -m sagas.nlu.spacy_procs ents 'New York'
    $ python -m sagas.nlu.spacy_procs ents 'I am from China'
    $ python -m sagas.nlu.spacy_procs ents "Ada Lovelace was born in London"

    :param sents:
    :param lang:
    :return:
    """
    import sagas
    rs = []
    doc = self.spacy_doc(sents, lang, simple=simple)
    for ent in doc.ents:
        rs.append((ent.text, ent.start_char, ent.end_char, ent.label_, ent.kb_id_))
    r = sagas.to_df(rs, ['word', 'start', 'end', 'entity', 'kb'])
    sagas.print_df(r)

def rs_summary(rs, console=True):
    from sagas.tool.misc import print_stem_chunks
    from IPython.display import display
    import sagas

    for serial, r in enumerate(rs):
        df = sagas.to_df(r['domains'],
                         ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        if 'head' in r:
            cla = "%s(%s)" % (r['head'], r['head_pos'])
        else:
            cla = '_'
        print('%s(%s)' % (r['type'], r['lemma']), cla)
        # sagas.print_df(df)
        if not console:
            display(df)
        else:
            sagas.print_df(df)
        print_stem_chunks(r)

def list_chunk_entities(self, sents, lang='en'):
    """
    $ python -m sagas.nlu.chunk_entities list_chunk_entities 'Apple is looking at buying U.K. startup for $1 billion.'
    $ python -m sagas.nlu.chunk_entities list_chunk_entities "Where's the president?"
    $ python -m sagas.nlu.chunk_entities list_chunk_entities "διαμένω στη Νέα Υόρκη" el

    :param sents:
    :return:
    """
    import sagas

    doc = self.core_nlp(sents)
    doc_s = doc.sentences[0]
    tokens = tokenize(sents, doc_s)
    for tok in tokens:
        print(tok.index, '\t', tok.word, tok.word_offset, tok.positions)

    ent_pos = self.entity_positions(sents, lang)
    print(ent_pos)

    # process spans and overlaps
    chunks = []
    r = self.get_verb_domain(doc.sentences[0])
    # r = self.get_chunks(doc.sentences[0])
    if len(r) > 0:
        for el in r[0]['domains']:
            span_id = el[0]
            span_pos = el[4]
            start_mark = tokens[span_pos[0] - 1]
            end_mark = tokens[span_pos[-1] - 1]
            word_range = [start_mark.positions['start'], end_mark.positions['end']]
            entities = get_included_entities(word_range, ent_pos)
            chunks.append((span_id, span_pos, word_range,
                           sents[word_range[0]:word_range[1]],
                           [ent['entity'] for ent in entities]))
        df = sagas.to_df(chunks, ['rel', 'positions', 'range', 'chunk text', 'entities'])
        sagas.print_df(df[['rel', 'chunk text', 'entities']])
    else:
        # print("no chunks.")
        print("no verbs.")

def nltk_locales(self):
    """
    $ python -m sagas.nlu.locales nltk_locales

    :return:
    """
    from nltk.corpus import wordnet as wn
    from iso639 import languages
    import sagas

    langs = wn.langs()
    print(len(langs), sorted(langs))
    rs = []
    excepts = ['qcn']
    for lang in langs:
        if lang not in excepts:
            loc = languages.get(part3=lang)
            rs.append((loc.part3, loc.macro, loc.name))
    df = sagas.to_df(rs, ['code', 'macro', 'name'])
    sagas.print_df(df)

def get_word_trans(self, word, lang, pos='*'):
    import sagas
    from sagas.nlu.translator import translate, with_words, WordsObserver

    r, t = translate(word, source=lang, target='en',
                     options={'get_pronounce'}, tracker=with_words())
    if r:
        word_r = r.lower()
        tc.emp('cyan', f"1. translate: {word_r}")
        obs: WordsObserver = t.observer(WordsObserver)
        dfs = obs.trans_dfs
        if dfs:
            tc.emp('cyan', f"2. candidates: {obs.get_axis(word_r, pos)}")
            for k, df in dfs.items():
                print(f"- pos:{k} -")
                sagas.print_df(df)
        else:
            tc.emp('cyan', f"2. no candidates.")
        return word_r
    return ''

def disp_by_offset(self, lang, offset, pos='n'):
    """
    $ python -m sagas.nlu.omw_extended disp_by_offset ru 9918554
    $ python -m sagas.nlu.omw_extended disp_by_offset de 9918554

    :param offset:
    :return:
    """
    import sagas

    offset = str(offset)
    id = '%s-%s' % (offset.zfill(8), pos)
    rs = []
    print('search for', id)
    if lang in langsets:
        data = self.load_dicts(lang)
        for row in data:
            if row[0] == id:
                rs.append((row[0], row[2]))
        df = sagas.to_df(rs, ['id', 'word'])
        sagas.print_df(df)
    else:
        print('no data.')

def defs(self):
    """
    $ python -m sagas.zh.ltp_meta defs

    :return:
    """
    sagas.print_df(self.roles_df)
    sagas.print_df(self.dep_defs)
    sagas.print_df(self.pos_defs)

def verb_domains(self, sents, lang='en'):
    """
    $ python -m sagas.nlu.uni_parser verb_domains "Barack Obama was born in Hawaii." en
    # 我有一只阿比西尼亚猫 ("I have an Abyssinian cat.")
    $ python -m sagas.nlu.uni_parser verb_domains "I have an Abyssinian cat." en
    $ python -m sagas.nlu.uni_parser verb_domains 'Что ты обычно ешь на ужин?' ru
    $ python -m sagas.nlu.uni_parser verb_domains 'Die Zeitschrift erscheint monatlich.' de
    # test with multiple verbs (verbs in subordinate clauses are filtered out):
    $ python -m sagas.nlu.uni_parser verb_domains 'Tu as choisi laquelle tu vas manger ?' fr

    :param sents:
    :param lang:
    :return:
    """
    from sagas.nlu.corenlp_helper import get_nlp

    serial_numbers = '❶❷❸❹❺❻❼❽❾❿'
    nlp = get_nlp(lang)
    doc = nlp(sents)

    # Analyze dependency relations bottom-up; this can be used to collect the child nodes
    # of specific relations, e.g. 'nsubj:pass' and 'obl' in the first example.
    # word.governor is the parent of the current word.
    sent = doc.sentences[0]
    rs = get_verb_domain(sent)
    # r = rs[0]
    for num, r in enumerate(rs):
        # print(json.dumps(r, indent=2, ensure_ascii=False))
        print(serial_numbers[num], '-' * 50)
        # print(r['verb'], r['index'])
        print(r['word'], r['index'])
        # df = sagas.to_df(r[0]['domains'], ['rel', 'index', 'text', 'children'])
        df = sagas.to_df(r['domains'],
                         ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        sagas.print_df(df)
        for stem in r['stems']:
            if stem[0] == 'obj':
                print('object ->', ' '.join(stem[1]))

def all_voices(self, lang=None):
    """
    $ python -m sagas.nlu.nlu_tools all_voices
    $ nlu all-voices ru

    :return:
    """
    import pyttsx3
    import sagas

    engine = pyttsx3.init()
    voices: collections.Iterable = engine.getProperty('voices')
    rs = []
    for voice in voices:
        if lang is not None:
            if voice.languages[0].startswith(lang):
                print(voice)
        else:
            print(voice, voice.id, voice.languages[0])
        rs.append((voice.id.replace('com.apple.speech.synthesis.', ''),
                   voice.name, voice.languages, voice.gender))
    rs = sorted(rs, key=lambda el: el[2][0])
    sagas.print_df(sagas.to_df(rs, ['id', 'name', 'lang', 'gender']))

def dfs(self, *args):
    import sagas
    for arg in args:
        # print(arg)
        sagas.print_df(arg)

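# A minimal sketch of the pattern shared by the snippets above: collect rows as tuples,
# turn them into a DataFrame with sagas.to_df(rows, columns), and render it with
# sagas.print_df(df). Only to_df and print_df are taken from the code above; the function
# name and the sample rows are hypothetical, and the sketch assumes the sagas package is
# importable.
def print_rows_example():
    import sagas
    rows = [('hello', 'en'), ('hola', 'es')]   # made-up sample data
    df = sagas.to_df(rows, ['text', 'lang'])   # build a DataFrame with named columns
    sagas.print_df(df)                         # pretty-print it to the console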