def analyse_sentence(surfaces, options=None): # words: string(utf-8) text = ' '.join(surfaces) if options.echo_on: print ansi_color.ANSI_UNDERLINE_ON + ansi_color.ANSI_BOLD_ON + \ text + \ ansi_color.ANSI_BOLD_OFF + ansi_color.ANSI_UNDERLINE_OFF print if options.speech_mode: speak_latin.say_latin(text.decode('utf-8')) surfaces_uc = [surface.decode('utf-8') for surface in surfaces] # words = [Word(surface, items) for surface, items in lookup_all_words(words_uc)] words = lookup_all(surfaces_uc) # dump_res(res) # util.pp(map(lambda r:r[0], res)) for i, sentence in enumerate(split_sentence_by_verb(words)): # sentence.count_patterns() # 前置詞の格支配を利用して絞り込む sentence.prep_constraint() # sentence.dot('_'.join([word.surface.encode('utf-8') for word in sentence.words])) # 属格支配する形容詞 sentence.genitive_domination() # 形容詞などの性・数・格一致を利用して絞り込む sentence.modifier_constraint() # sentence.count_patterns() # 属格がどこにかかるか sentence.genitive_constraint() if options and options.show_translation: sentence.translate() if options and options.show_word_detail: if options.show_translation: print " ---" sentence.dump() print if options.speech_mode: speak_latin.pause_while_speaking()
def analyse_text(text, options=None): # テキストを(句点などで)センテンスに切り分ける for word_surfaces_in_a_sentence in textutil.sentence_stream(textutil.word_stream_from_text(text)): plain_text = ' '.join(word_surfaces_in_a_sentence) if options.echo_on: # print plain_text + "\n" print "\n" + ansi_color.underline(ansi_color.bold( plain_text )) + "\n" if options.speech_mode: speak_latin.say_latin(plain_text.decode('utf-8')) # unicodeに変換して word_surfaces_uc = [word_surface.decode('utf-8', 'strict') for word_surface in word_surfaces_in_a_sentence] # 辞書を引いてから words = lookup_all(word_surfaces_uc) # 先にlookup結果を表示してしまう if options and options.show_word_detail: print " --- " maxlen_uc = max([0] + [word.surface_len for word in words]) for i, word in enumerate(words): text = word.surface.encode('utf-8') print ' %2d ' % (i) + text + ' '*(maxlen_uc - word.surface_len + 1), word.detail() print " --- " print # 形容詞/属格の対応 words, visited_ix = detect_and_or(words) words, adj_ix = detect_adj_correspondances(words) words, gen_ix = detect_genitive_correspondances(words) for ix in visited_ix: words[ix] = None for ix in adj_ix: words[ix] = None for ix in gen_ix: words[ix] = None words = filter(identity, words) words, verb_ix = detect_verbs(words) words = detect_prep_domination(words) words = filter(identity, words) # print [word.surface_utf8() for word in words] print # 名詞句を述語動詞に結びつける verbs_ix = [] verb_count = 0 for i, word in enumerate(words): if isinstance(word, Predicate): verbs_ix.append(i) verb_count += 1 verb_surfaces = ', '.join([ansi_color.bold(words[ix].surface_utf8()) for ix in verbs_ix]) M = len(words) groups = [] if verb_count == 0: print ansi_color.underline("NO VERB FOUND.") groups.append( range(M) ) elif verb_count == 1: print ansi_color.underline("1 VERB FOUND:") + ' ' + verb_surfaces groups.append( range(M) ) else: print ansi_color.underline("%d VERBS FOUND:" % verb_count) + ' ' + verb_surfaces groups.append( range(verbs_ix[0]+1) ) # [0..ix0] for i in range(1, verb_count-1): groups.append( [verbs_ix[i]] ) groups.append( range(verbs_ix[verb_count-1], M) ) for i in range(verb_count-1): fr = groups[i][-1] + 1 to = groups[i+1][0] - 1 if fr == to: continue well_divided_at = None for j in range(fr, to+1): if words[j].surface == u',': well_divided_at = j break if well_divided_at is None: for j in range(fr, to+1): if words[j].surface == u'quod': well_divided_at = j-1 break if well_divided_at is None: for j in range(fr, to+1): if words[j].surface == u'et': well_divided_at = j-1 break if well_divided_at is not None: groups[i] += range(fr, well_divided_at+1) groups[i+1] = range(well_divided_at+1, to+1) + groups[i+1] else: print " NOT WELL: {%d..%d}" % (fr, to) # うまく分けられない。とりあえず後の方に入れる groups[i+1] = range(fr, to+1) + groups[i+1] print for i, group in enumerate(groups): if verb_count == 0: ws = [] for word in words: if isinstance(word, Word) and not word.items: continue # ws.append(word) dump(word) print " → ", translate(word) print # " --" # dump(ws) # print translate(ws) else: not_solved = [] # words_in_group = [words[ix] for ix in group] verb_ix = verbs_ix[i] pred = words[verb_ix] # predicate for j, ix in enumerate(group): if ix == verb_ix: continue word = words[ix] if isinstance(word, AndOr): pred.add_nominal(word.cases[0], word) elif isinstance(word, PrepClause): pred.add_nominal(word.prep, word) elif isinstance(word, Word): if not word.items: continue first_item = word.items[0] if j == 0 and word.surface in (u'quod', u'ut'): if word.items[1].pos == 'conj': first_item = word.items[1] word.items = word.items[1:] if first_item.pos == 'conj': if j < 2 and not pred.conjunction: pred.conjunction = word else: not_solved.append(word) elif first_item.pos == 'adv': if j < 2 and not pred.conjunction: pred.conjunction = word elif word.surface in (u'ō', u'Ō'): # 二重になってないかチェックする or conjunction を複数取る pred.conjunction = word else: pred.add_modifier(word) elif first_item._: cases = [x[0] for x in first_item._] case = None if 'Voc' in cases and ix > 0 and words[ix-1].surface in (u'ō', u'Ō'): case = 'Voc' # 形的にVocしかありえないケースも拾いたい else: for x in first_item._: if x[0] == 'Nom': if x[2] == 'n': case = 'Nom/Acc' else: case = x[0] break elif x[0] == 'Acc': case = x[0] break else: if not case: case = x[0] # if not case: case = case_n pred.add_nominal(case, word) else: # print "not solved += ", word.surface_utf8() not_solved.append(word) #(ix, word.surface_utf8())) if not_solved: print " NOT SOLVED:" # dump(not_solved, initial_indent=2) # print translate(not_solved) for item in not_solved: dump(item, 4) print " → ", translate(item) print dump(pred) print print " → ", translate(pred) print # 音読モードの場合、読み終わるまでウェイトを入れる if options.speech_mode: speak_latin.pause_while_speaking()