def render_with_indent(indent, obj): if isinstance(obj, Sentence): for word in obj.words: render_with_indent(indent+2, word) elif isinstance(obj, AndOr): print ' '*indent + '[' + obj.and_or_word.encode('utf-8') + '] ' # + str(obj._) for words in obj.words_slots: render_with_indent(indent+2, words[0]) elif isinstance(obj, PrepClause): # print ' '*indent + obj.item.surface.encode('utf-8') + ' ' + obj.item.ja + ' <'+ obj.dominated_case + '>' print ' '*indent + obj.item.surface.encode('utf-8') + ' <'+ obj.dominated_case + '>' for word in obj.words: render_with_indent(indent+2, word) elif isinstance(obj, Word): if not obj.items: return text = decolate(obj) if obj.items[0].pos in ('conj', 'adv'): text = '(' + text + ')' print ' '*indent + text # word.surface.encode('utf-8') for gen in obj.genitives: render_with_indent(indent+2, gen) for mod in obj.modifiers: render_with_indent(indent+2, mod) # print ' ' + mod.surface.encode('utf-8') elif isinstance(obj, Predicate): text = obj.surface.encode('utf-8') text = ansi_color.underline(ansi_color.bold(text, ansi_color.RED)) print ' '*indent + text, "(%s %s%s)" % (obj.mood(), str(obj.person()), obj.number()) if obj.conjunction: render_with_indent(indent+2, obj.conjunction) for mod in obj.modifiers: render_with_indent(indent+2, mod) for case, objs in obj.case_slot.items(): if isinstance(case, unicode): print ' '*(indent+2) + "prep:" else: print ' '*(indent+2) + case + ":" for obj in objs: render_with_indent(indent+4, obj) elif isinstance(obj, list): for item in obj: render_with_indent(indent+2, item)
def dump(self): # (表示用に)単語の最大長を得ておく maxlen_uc = max([0] + [word.surface_len for word in self.words]) for i, word in enumerate(self.words): is_verb = False if word.is_verb(): color = ansi_color.RED is_verb = True # if verb_count == 0: # st['predicate'] = item # elif items and any([item['pos'] in ['noun','pronoun'] for item in items]): elif word.has_subst_case('Nom'): color = ansi_color.BLUE elif word.has_subst_case('Acc'): color = ansi_color.BLACK elif word.has_subst_case('Gen'): color = ansi_color.GREEN elif word.has_subst_case('Abl'): color = ansi_color.YELLOW elif word.has_subst_case('Dat'): color = ansi_color.MAGENTA else: color = None # ansi_color.DEFAULT text = word.surface.encode('utf-8') #1 print "/%s/ %s" % (text, str(color)) # text = (u'%*s' % (-maxlen_uc, surface)).encode('utf-8') if color is not None: text = ansi_color.bold(text, color) if is_verb: text = ansi_color.underline(text) print ' %2d ' % (i) + text + ' '*(maxlen_uc - word.surface_len + 1), if word.items is None: print elif word.items == []: print '(?)' else: print ' | '.join([item.description() for item in word.items]) print
def decolate(word):
    """Return the UTF-8 encoded surface of ``word``, coloured by case.

    The cases Nom, Acc, Gen, Abl and Dat are tried in that order through
    ``word.has_subst_case``; the first match selects the colour (blue,
    black, green, yellow, magenta respectively) and the surface is
    returned wrapped by ``ansi_color.bold``.  When no case matches, the
    plain encoded surface is returned.
    """
    # Case -> name of the ansi_color constant, in priority order.  The
    # constant is looked up only for the case that actually matches.
    case_colors = (
        ('Nom', 'BLUE'),
        ('Acc', 'BLACK'),
        ('Gen', 'GREEN'),
        ('Abl', 'YELLOW'),
        ('Dat', 'MAGENTA'),
    )

    color = None
    for case_name, color_attr in case_colors:
        if word.has_subst_case(case_name):
            color = getattr(ansi_color, color_attr)
            break

    encoded = word.surface.encode('utf-8')
    if color is None:
        return encoded
    return ansi_color.bold(encoded, color)
def analyse_text(text, options=None): # テキストを(句点などで)センテンスに切り分ける for word_surfaces_in_a_sentence in textutil.sentence_stream(textutil.word_stream_from_text(text)): plain_text = ' '.join(word_surfaces_in_a_sentence) if options.echo_on: # print plain_text + "\n" print "\n" + ansi_color.underline(ansi_color.bold( plain_text )) + "\n" if options.speech_mode: speak_latin.say_latin(plain_text.decode('utf-8')) # unicodeに変換して word_surfaces_uc = [word_surface.decode('utf-8', 'strict') for word_surface in word_surfaces_in_a_sentence] # 辞書を引いてから words = lookup_all(word_surfaces_uc) # 先にlookup結果を表示してしまう if options and options.show_word_detail: print " --- " maxlen_uc = max([0] + [word.surface_len for word in words]) for i, word in enumerate(words): text = word.surface.encode('utf-8') print ' %2d ' % (i) + text + ' '*(maxlen_uc - word.surface_len + 1), word.detail() print " --- " print # 形容詞/属格の対応 words, visited_ix = detect_and_or(words) words, adj_ix = detect_adj_correspondances(words) words, gen_ix = detect_genitive_correspondances(words) for ix in visited_ix: words[ix] = None for ix in adj_ix: words[ix] = None for ix in gen_ix: words[ix] = None words = filter(identity, words) words, verb_ix = detect_verbs(words) words = detect_prep_domination(words) words = filter(identity, words) # print [word.surface_utf8() for word in words] print # 名詞句を述語動詞に結びつける verbs_ix = [] verb_count = 0 for i, word in enumerate(words): if isinstance(word, Predicate): verbs_ix.append(i) verb_count += 1 verb_surfaces = ', '.join([ansi_color.bold(words[ix].surface_utf8()) for ix in verbs_ix]) M = len(words) groups = [] if verb_count == 0: print ansi_color.underline("NO VERB FOUND.") groups.append( range(M) ) elif verb_count == 1: print ansi_color.underline("1 VERB FOUND:") + ' ' + verb_surfaces groups.append( range(M) ) else: print ansi_color.underline("%d VERBS FOUND:" % verb_count) + ' ' + verb_surfaces groups.append( range(verbs_ix[0]+1) ) # [0..ix0] for i in range(1, verb_count-1): groups.append( 
[verbs_ix[i]] ) groups.append( range(verbs_ix[verb_count-1], M) ) for i in range(verb_count-1): fr = groups[i][-1] + 1 to = groups[i+1][0] - 1 if fr == to: continue well_divided_at = None for j in range(fr, to+1): if words[j].surface == u',': well_divided_at = j break if well_divided_at is None: for j in range(fr, to+1): if words[j].surface == u'quod': well_divided_at = j-1 break if well_divided_at is None: for j in range(fr, to+1): if words[j].surface == u'et': well_divided_at = j-1 break if well_divided_at is not None: groups[i] += range(fr, well_divided_at+1) groups[i+1] = range(well_divided_at+1, to+1) + groups[i+1] else: print " NOT WELL: {%d..%d}" % (fr, to) # うまく分けられない。とりあえず後の方に入れる groups[i+1] = range(fr, to+1) + groups[i+1] print for i, group in enumerate(groups): if verb_count == 0: ws = [] for word in words: if isinstance(word, Word) and not word.items: continue # ws.append(word) dump(word) print " → ", translate(word) print # " --" # dump(ws) # print translate(ws) else: not_solved = [] # words_in_group = [words[ix] for ix in group] verb_ix = verbs_ix[i] pred = words[verb_ix] # predicate for j, ix in enumerate(group): if ix == verb_ix: continue word = words[ix] if isinstance(word, AndOr): pred.add_nominal(word.cases[0], word) elif isinstance(word, PrepClause): pred.add_nominal(word.prep, word) elif isinstance(word, Word): if not word.items: continue first_item = word.items[0] if j == 0 and word.surface in (u'quod', u'ut'): if word.items[1].pos == 'conj': first_item = word.items[1] word.items = word.items[1:] if first_item.pos == 'conj': if j < 2 and not pred.conjunction: pred.conjunction = word else: not_solved.append(word) elif first_item.pos == 'adv': if j < 2 and not pred.conjunction: pred.conjunction = word elif word.surface in (u'ō', u'Ō'): # 二重になってないかチェックする or conjunction を複数取る pred.conjunction = word else: pred.add_modifier(word) elif first_item._: cases = [x[0] for x in first_item._] case = None if 'Voc' in cases and ix > 0 and words[ix-1].surface 
in (u'ō', u'Ō'): case = 'Voc' # 形的にVocしかありえないケースも拾いたい else: for x in first_item._: if x[0] == 'Nom': if x[2] == 'n': case = 'Nom/Acc' else: case = x[0] break elif x[0] == 'Acc': case = x[0] break else: if not case: case = x[0] # if not case: case = case_n pred.add_nominal(case, word) else: # print "not solved += ", word.surface_utf8() not_solved.append(word) #(ix, word.surface_utf8())) if not_solved: print " NOT SOLVED:" # dump(not_solved, initial_indent=2) # print translate(not_solved) for item in not_solved: dump(item, 4) print " → ", translate(item) print dump(pred) print print " → ", translate(pred) print # 音読モードの場合、読み終わるまでウェイトを入れる if options.speech_mode: speak_latin.pause_while_speaking()