def repl(options=None, show_prompt=False):
    while True:
        if show_prompt:
            sys.stdout.write("> ")
            sys.stdout.flush()
        line = sys.stdin.readline()
        if not line: break
        text = line.rstrip()
        if options and not options.strict_macron_mode:
            text = char.trans(text)
        # textutil.analyse_text(text, analyse_sentence)
        for sentence in textutil.sentence_stream(textutil.word_stream_from_text(text)):
            analyse_sentence(sentence, options=options)
    if show_prompt: print

def main():
    options = Options(sys.argv[1:])

    if options.speech_mode:
        speak_latin.init_synth('Alex')

    latindic.load(auto_macron_mode=options.auto_macron_mode)

    if len(options.args) == 0:  # repl mode
        if select.select([sys.stdin,], [], [], 0.0)[0]:  # have data from pipe. no prompt.
            repl(options=options)
        else:
            repl(options=options, show_prompt=True)
    else:  # file mode
        for file in options.args:
            text = textutil.load_text_from_file(file)
            if options.strict_macron_mode:
                text = char.trans(text)
            # textutil.analyse_text(text, analyse_sentence, options=options)
            for sentence in textutil.sentence_stream(textutil.word_stream_from_text(text)):
                analyse_sentence(sentence, options=options)

def analyse_text(text, options=None):
    # Split the text into sentences (at sentence-final punctuation, etc.)
    for word_surfaces_in_a_sentence in textutil.sentence_stream(textutil.word_stream_from_text(text)):
        plain_text = ' '.join(word_surfaces_in_a_sentence)
        if options.echo_on:
            # print plain_text + "\n"
            print "\n" + ansi_color.underline(ansi_color.bold( plain_text )) + "\n"
        if options.speech_mode:
            speak_latin.say_latin(plain_text.decode('utf-8'))

        # convert to unicode
        word_surfaces_uc = [word_surface.decode('utf-8', 'strict') for word_surface in word_surfaces_in_a_sentence]

        # look the words up in the dictionary
        words = lookup_all(word_surfaces_uc)

        # show the raw lookup results first
        if options and options.show_word_detail:
            print " --- "
            maxlen_uc = max([0] + [word.surface_len for word in words])
            for i, word in enumerate(words):
                text = word.surface.encode('utf-8')
                print ' %2d ' % (i) + text + ' '*(maxlen_uc - word.surface_len + 1), word.detail()
            print " --- "
            print

        # adjective / genitive correspondences
        words, visited_ix = detect_and_or(words)
        words, adj_ix = detect_adj_correspondances(words)
        words, gen_ix = detect_genitive_correspondances(words)
        for ix in visited_ix: words[ix] = None
        for ix in adj_ix: words[ix] = None
        for ix in gen_ix: words[ix] = None
        words = filter(identity, words)

        words, verb_ix = detect_verbs(words)
        words = detect_prep_domination(words)
        words = filter(identity, words)
        # print [word.surface_utf8() for word in words]
        print

        # bind noun phrases to the predicate verbs
        verbs_ix = []
        verb_count = 0
        for i, word in enumerate(words):
            if isinstance(word, Predicate):
                verbs_ix.append(i)
                verb_count += 1
        verb_surfaces = ', '.join([ansi_color.bold(words[ix].surface_utf8()) for ix in verbs_ix])

        M = len(words)
        groups = []
        if verb_count == 0:
            print ansi_color.underline("NO VERB FOUND.")
            groups.append( range(M) )
        elif verb_count == 1:
            print ansi_color.underline("1 VERB FOUND:") + ' ' + verb_surfaces
            groups.append( range(M) )
        else:
            print ansi_color.underline("%d VERBS FOUND:" % verb_count) + ' ' + verb_surfaces
            groups.append( range(verbs_ix[0]+1) ) # [0..ix0]
            for i in range(1, verb_count-1):
                groups.append( [verbs_ix[i]] )
            groups.append( range(verbs_ix[verb_count-1], M) )
            for i in range(verb_count-1):
                fr = groups[i][-1] + 1
                to = groups[i+1][0] - 1
                if fr == to: continue
                well_divided_at = None
                for j in range(fr, to+1):
                    if words[j].surface == u',':
                        well_divided_at = j
                        break
                if well_divided_at is None:
                    for j in range(fr, to+1):
                        if words[j].surface == u'quod':
                            well_divided_at = j-1
                            break
                if well_divided_at is None:
                    for j in range(fr, to+1):
                        if words[j].surface == u'et':
                            well_divided_at = j-1
                            break
                if well_divided_at is not None:
                    groups[i] += range(fr, well_divided_at+1)
                    groups[i+1] = range(well_divided_at+1, to+1) + groups[i+1]
                else:
                    print " NOT WELL: {%d..%d}" % (fr, to)
                    # cannot split cleanly; put the words into the later group for now
                    groups[i+1] = range(fr, to+1) + groups[i+1]
        print

        for i, group in enumerate(groups):
            if verb_count == 0:
                ws = []
                for word in words:
                    if isinstance(word, Word) and not word.items: continue
                    # ws.append(word)
                    dump(word)
                    print " → ", translate(word)
                    print # " --"
                # dump(ws)
                # print translate(ws)
            else:
                not_solved = []
                # words_in_group = [words[ix] for ix in group]
                verb_ix = verbs_ix[i]
                pred = words[verb_ix] # predicate
                for j, ix in enumerate(group):
                    if ix == verb_ix: continue
                    word = words[ix]
                    if isinstance(word, AndOr):
                        pred.add_nominal(word.cases[0], word)
                    elif isinstance(word, PrepClause):
                        pred.add_nominal(word.prep, word)
                    elif isinstance(word, Word):
                        if not word.items: continue
                        first_item = word.items[0]
                        if j == 0 and word.surface in (u'quod', u'ut'):
                            if word.items[1].pos == 'conj':
                                first_item = word.items[1]
                                word.items = word.items[1:]
                        if first_item.pos == 'conj':
                            if j < 2 and not pred.conjunction:
                                pred.conjunction = word
                            else:
                                not_solved.append(word)
                        elif first_item.pos == 'adv':
                            if j < 2 and not pred.conjunction:
                                pred.conjunction = word
                            elif word.surface in (u'ō', u'Ō'):
                                # TODO: check this is not set twice, or allow multiple conjunctions
                                pred.conjunction = word
                            else:
                                pred.add_modifier(word)
                        elif first_item._:
                            cases = [x[0] for x in first_item._]
                            case = None
                            if 'Voc' in cases and ix > 0 and words[ix-1].surface in (u'ō', u'Ō'):
                                case = 'Voc'
                                # would also like to catch words whose form allows only Voc
                            else:
                                for x in first_item._:
                                    if x[0] == 'Nom':
                                        if x[2] == 'n':
                                            case = 'Nom/Acc'
                                        else:
                                            case = x[0]
                                        break
                                    elif x[0] == 'Acc':
                                        case = x[0]
                                        break
                                    else:
                                        if not case: case = x[0]
                            # if not case: case = case_n
                            pred.add_nominal(case, word)
                        else:
                            # print "not solved += ", word.surface_utf8()
                            not_solved.append(word) #(ix, word.surface_utf8()))

                if not_solved:
                    print " NOT SOLVED:"
                    # dump(not_solved, initial_indent=2)
                    # print translate(not_solved)
                    for item in not_solved:
                        dump(item, 4)
                        print " → ", translate(item)
                        print

                dump(pred)
                print
                print " → ", translate(pred)
                print

        # in speech mode, wait until the sentence has finished being read aloud
        if options.speech_mode:
            speak_latin.pause_while_speaking()

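# Entry-point guard (an assumption: the original excerpt does not show how the script
# is started). The conventional idiom below would let the analyzer be run directly,
# e.g. `python main.py somefile.txt`, or interactively with no arguments.
if __name__ == '__main__':
    main()
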
def main():
    latindic.load(auto_macron_mode=False)

    show_title('original text')
    text = textutil.load_text_from_file('./latin.txt')
    print text[:1000], '...'
    print

    show_title('texts in base-form')
    texts_in_baseform = []
    for word_surfaces_in_a_sentence in textutil.sentence_stream(textutil.word_stream_from_text(text)):
        # print word_surfaces_in_a_sentence
        bases = base_forms_of_words(word_surfaces_in_a_sentence)
        texts_in_baseform.append(bases)
    for sentence in texts_in_baseform[:20]:
        print ' '.join([baseform.encode('utf-8') for baseform in sentence])
    print '...'
    print

    show_title('[gensim] dictionary')
    dictionary = corpora.Dictionary(texts_in_baseform)
    # dictionary.save('/tmp/latintext.dict')  # store the dictionary, for future reference
    # print dictionary
    print '{',
    for token, id in dictionary.token2id.items():
        print '\"%s\": %d,' % (token.encode('utf-8'), id),
    print '}'
    # new_doc = "In Crētā īnsulā māgnum labyrinthum Daedalus aedificāvit plēnum viārum flexuōsārum."
    # new_bases = base_forms_of_words(new_doc.split())
    # # print new_bases
    # new_vec = dictionary.doc2bow(new_bases)
    # print new_vec

    show_title('[gensim] corpus')
    corpus = [dictionary.doc2bow(text) for text in texts_in_baseform]
    # corpora.MmCorpus.serialize('/tmp/latintext.mm', corpus)
    # print corpus
    for doc in corpus[:20]:
        print doc
    print '...'
    print

    show_title('tf-idf')
    # term frequency * inverse document frequency
    tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
    corpus_tfidf = tfidf[corpus]
    for i, doc in enumerate(corpus_tfidf):
        print doc
        if i == 20: break
    print '...'
    print

    #
    def decode_result(item, delim):
        def translate(token):
            # print "translating \"%s\"..." % token.encode('utf-8')
            items = latindic.lookup(token)
            return items[0]['ja'] if items else '*'
            # print "\t", items[0]['ja']
        latin_tokens = re.split(delim, item)[1::2]
        jas = [translate(token) for token in latin_tokens]
        return ' / '.join(jas)

    NUM_TOPICS = 80
    TOPICS_TO_TAKE = 10

    show_title('LSI (Latent Semantic Indexing)')
    # initialize an LSI transformation
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=NUM_TOPICS)
    # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
    corpus_lsi = lsi[corpus_tfidf]
    topics = lsi.print_topics(TOPICS_TO_TAKE)
    for i, item in enumerate(topics):
        print "%d) %s" % (1+i, item.encode('utf-8'))
        print " ", decode_result(item, '"')
        print
    print

    show_title('LDA (Latent Dirichlet Allocation)')
    model = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=NUM_TOPICS)
    topics = model.show_topics(topics=TOPICS_TO_TAKE)
    for i, item in enumerate(topics):
        print "%d) %s" % (1+i, item.encode('utf-8'))
        print " ", decode_result(item, ' ?[*+]')
        print
    print

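# A minimal sketch (not part of the original script) of the same gensim pipeline on a
# hand-made toy corpus: Dictionary -> doc2bow -> TfidfModel -> LsiModel. It assumes
# `from gensim import corpora, models` is imported at the top of this file, as the code
# above implies, and that print_topics returns formatted strings as in the gensim version
# used above. The function name and the toy tokens are illustrative only.
def _pipeline_smoke_test():
    toy_texts = [[u'puella', u'rosam', u'amat'],
                 [u'puer', u'puellae', u'rosam', u'dat'],
                 [u'puer', u'puellam', u'amat']]
    toy_dictionary = corpora.Dictionary(toy_texts)                    # token -> id mapping
    toy_corpus = [toy_dictionary.doc2bow(t) for t in toy_texts]       # bag-of-words vectors
    toy_tfidf = models.TfidfModel(toy_corpus)                         # reweight by tf-idf
    toy_lsi = models.LsiModel(toy_tfidf[toy_corpus],
                              id2word=toy_dictionary, num_topics=2)   # 2 latent topics
    for topic in toy_lsi.print_topics(2):
        print topic.encode('utf-8')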