Ejemplo n.º 1
0
def render_with_indent(indent, obj):
        if isinstance(obj, Sentence):
            for word in obj.words:
                render_with_indent(indent+2, word)

        elif isinstance(obj, AndOr):
            print ' '*indent + '[' + obj.and_or_word.encode('utf-8') + '] ' # + str(obj._)
            for words in obj.words_slots:
                render_with_indent(indent+2, words[0])

        elif isinstance(obj, PrepClause):
            # print ' '*indent + obj.item.surface.encode('utf-8') + ' ' + obj.item.ja + ' <'+ obj.dominated_case + '>'
            print ' '*indent + obj.item.surface.encode('utf-8') + ' <'+ obj.dominated_case + '>'
            for word in obj.words:
                render_with_indent(indent+2, word)

        elif isinstance(obj, Word):
            if not obj.items: return

            text = decolate(obj)
            if obj.items[0].pos in ('conj', 'adv'):
                text = '(' + text + ')'
            print ' '*indent + text # word.surface.encode('utf-8')
            for gen in obj.genitives:
                render_with_indent(indent+2, gen)
            for mod in obj.modifiers:
                render_with_indent(indent+2, mod)
#                print '    ' + mod.surface.encode('utf-8')

        elif isinstance(obj, Predicate):
            text = obj.surface.encode('utf-8')
            text = ansi_color.underline(ansi_color.bold(text, ansi_color.RED))
            print ' '*indent + text, "(%s %s%s)" % (obj.mood(), str(obj.person()), obj.number())
            if obj.conjunction:
                render_with_indent(indent+2, obj.conjunction)
            for mod in obj.modifiers:
                render_with_indent(indent+2, mod)
            for case, objs in obj.case_slot.items():
                if isinstance(case, unicode):
                    print ' '*(indent+2) + "prep:"
                else:
                    print ' '*(indent+2) + case + ":"
                for obj in objs:
                    render_with_indent(indent+4, obj)

        elif isinstance(obj, list):
            for item in obj:
                render_with_indent(indent+2, item)
Ejemplo n.º 2
0
    def dump(self):
        # (表示用に)単語の最大長を得ておく
        maxlen_uc = max([0] + [word.surface_len for word in self.words])

        for i, word in enumerate(self.words):
            is_verb = False
            if word.is_verb():
                color = ansi_color.RED
                is_verb = True
#            if verb_count == 0:
#                st['predicate'] = item
#        elif items and any([item['pos'] in ['noun','pronoun'] for item in items]):
            elif word.has_subst_case('Nom'):
                color = ansi_color.BLUE
            elif word.has_subst_case('Acc'):
                color = ansi_color.BLACK
            elif word.has_subst_case('Gen'):
                color = ansi_color.GREEN
            elif word.has_subst_case('Abl'):
                color = ansi_color.YELLOW
            elif word.has_subst_case('Dat'):
                color = ansi_color.MAGENTA
            else:
                color = None # ansi_color.DEFAULT

            text = word.surface.encode('utf-8')
#1        print "/%s/ %s" % (text, str(color))
        # text = (u'%*s' % (-maxlen_uc, surface)).encode('utf-8')
            if color is not None:
                text = ansi_color.bold(text, color)
            if is_verb:
                text = ansi_color.underline(text)

            print '  %2d  ' % (i) + text + ' '*(maxlen_uc - word.surface_len + 1),

            if word.items is None:
                print
            elif word.items == []:
                print '(?)'
            else:
                print ' | '.join([item.description() for item in word.items])
        print
Ejemplo n.º 3
0
def analyse_text(text, options=None):
    # テキストを(句点などで)センテンスに切り分ける
    for word_surfaces_in_a_sentence in textutil.sentence_stream(textutil.word_stream_from_text(text)):
        plain_text = ' '.join(word_surfaces_in_a_sentence)
        if options.echo_on:
            # print plain_text + "\n"
            print "\n" + ansi_color.underline(ansi_color.bold( plain_text )) + "\n"
        if options.speech_mode:
            speak_latin.say_latin(plain_text.decode('utf-8'))

        # unicodeに変換して
        word_surfaces_uc = [word_surface.decode('utf-8', 'strict') for word_surface in word_surfaces_in_a_sentence]
        # 辞書を引いてから
        words = lookup_all(word_surfaces_uc)

        # 先にlookup結果を表示してしまう
        if options and options.show_word_detail:
            print "  --- "
            maxlen_uc = max([0] + [word.surface_len for word in words])
            for i, word in enumerate(words):
                text = word.surface.encode('utf-8')
                print '  %2d  ' % (i) + text + ' '*(maxlen_uc - word.surface_len + 1), word.detail()
            print "  --- "
            print

        # 形容詞/属格の対応
        words, visited_ix = detect_and_or(words)
        words, adj_ix = detect_adj_correspondances(words)
        words, gen_ix = detect_genitive_correspondances(words)
        for ix in visited_ix:
            words[ix] = None
        for ix in adj_ix:
            words[ix] = None
        for ix in gen_ix:
            words[ix] = None
        words = filter(identity, words)

        words, verb_ix = detect_verbs(words)

        words = detect_prep_domination(words)
        words = filter(identity, words)
#        print [word.surface_utf8() for word in words]

        print

        # 名詞句を述語動詞に結びつける
        verbs_ix = []
        verb_count = 0
        for i, word in enumerate(words):
            if isinstance(word, Predicate):
                verbs_ix.append(i)
                verb_count += 1

        verb_surfaces = ', '.join([ansi_color.bold(words[ix].surface_utf8()) for ix in verbs_ix])
        M = len(words)
        groups = []
        if verb_count == 0:
            print ansi_color.underline("NO VERB FOUND.")
            groups.append( range(M) )
        elif verb_count == 1:
            print ansi_color.underline("1 VERB FOUND:") + ' ' + verb_surfaces
            groups.append( range(M) )
        else:
            print ansi_color.underline("%d VERBS FOUND:" % verb_count) + ' ' + verb_surfaces
            groups.append( range(verbs_ix[0]+1) ) # [0..ix0]
            for i in range(1, verb_count-1):
                groups.append( [verbs_ix[i]] )
            groups.append( range(verbs_ix[verb_count-1], M) )
            for i in range(verb_count-1):
                fr = groups[i][-1] + 1
                to = groups[i+1][0] - 1
                if fr == to: continue

                well_divided_at = None
                for j in range(fr, to+1):
                    if words[j].surface == u',':
                        well_divided_at = j
                        break
                if well_divided_at is None:
                    for j in range(fr, to+1):
                        if words[j].surface == u'quod':
                            well_divided_at = j-1
                            break
                if well_divided_at is None:
                    for j in range(fr, to+1):
                        if words[j].surface == u'et':
                            well_divided_at = j-1
                            break
                if well_divided_at is not None:
                    groups[i] += range(fr, well_divided_at+1)
                    groups[i+1] = range(well_divided_at+1, to+1) + groups[i+1]
                else:
                    print "  NOT WELL: {%d..%d}" % (fr, to)
                    # うまく分けられない。とりあえず後の方に入れる
                    groups[i+1] = range(fr, to+1) + groups[i+1]

        print
        for i, group in enumerate(groups):
            if verb_count == 0:
                ws = []
                for word in words:
                    if isinstance(word, Word) and not word.items: continue
                    # ws.append(word)
                    dump(word)
                    print "  → ", translate(word)
                    print # "  --"
                # dump(ws)
                # print translate(ws)
            else:
                not_solved = []
                # words_in_group = [words[ix] for ix in group]
                verb_ix = verbs_ix[i]
                pred = words[verb_ix] # predicate
                for j, ix in enumerate(group):
                    if ix == verb_ix: continue
                    word = words[ix]
                    if isinstance(word, AndOr):
                        pred.add_nominal(word.cases[0], word)
                    elif isinstance(word, PrepClause):
                        pred.add_nominal(word.prep, word)
                    elif isinstance(word, Word):
                        if not word.items: continue
                        first_item = word.items[0]
                        if j == 0 and word.surface in (u'quod', u'ut'):
                            if word.items[1].pos == 'conj':
                                first_item = word.items[1]
                                word.items = word.items[1:]
                        if first_item.pos == 'conj':
                            if j < 2 and not pred.conjunction:
                                pred.conjunction = word
                            else:
                                not_solved.append(word)
                        elif first_item.pos == 'adv':
                            if j < 2 and not pred.conjunction:
                                pred.conjunction = word
                            elif word.surface in (u'ō', u'Ō'):
                                # 二重になってないかチェックする or conjunction を複数取る
                                pred.conjunction = word
                            else:
                                pred.add_modifier(word)
                        elif first_item._:
                            cases = [x[0] for x in first_item._]
                            case = None
                            if 'Voc' in cases and ix > 0 and words[ix-1].surface in (u'ō', u'Ō'):
                                case = 'Voc'
                                # 形的にVocしかありえないケースも拾いたい
                            else:
                                for x in first_item._:
                                    if x[0] == 'Nom':
                                        if x[2] == 'n':
                                            case = 'Nom/Acc'
                                        else:
                                            case = x[0]
                                        break
                                    elif x[0] == 'Acc':
                                        case = x[0]
                                        break
                                    else:
                                        if not case:
                                            case = x[0]

                            # if not case: case = case_n
                            pred.add_nominal(case, word)
                        else:
                            # print "not solved += ", word.surface_utf8()
                            not_solved.append(word) #(ix, word.surface_utf8()))

                if not_solved:
                    print "  NOT SOLVED:"
                    # dump(not_solved, initial_indent=2)
                    # print translate(not_solved)
                    for item in not_solved:
                        dump(item, 4)
                        print "    → ", translate(item)
                        print

                dump(pred)
                print
                print "  → ", translate(pred)
                print

        # 音読モードの場合、読み終わるまでウェイトを入れる
        if options.speech_mode:
            speak_latin.pause_while_speaking()