Example #1
0
def analyse_sentence(surfaces, options=None):
    # words: string(utf-8)
    text = ' '.join(surfaces)

    if options.echo_on:
        print ansi_color.ANSI_UNDERLINE_ON + ansi_color.ANSI_BOLD_ON + \
            text + \
            ansi_color.ANSI_BOLD_OFF + ansi_color.ANSI_UNDERLINE_OFF
        print

    if options.speech_mode:
        speak_latin.say_latin(text.decode('utf-8'))


    surfaces_uc = [surface.decode('utf-8') for surface in surfaces]
    # words = [Word(surface, items) for surface, items in lookup_all_words(words_uc)]
    words = lookup_all(surfaces_uc)
    # dump_res(res)
    # util.pp(map(lambda r:r[0], res))

    for i, sentence in enumerate(split_sentence_by_verb(words)):
        # sentence.count_patterns()
        # 前置詞の格支配を利用して絞り込む
        sentence.prep_constraint()

        # sentence.dot('_'.join([word.surface.encode('utf-8') for word in sentence.words]))
        # 属格支配する形容詞
        sentence.genitive_domination()
        # 形容詞などの性・数・格一致を利用して絞り込む
        sentence.modifier_constraint()
#        sentence.count_patterns()
        # 属格がどこにかかるか
        sentence.genitive_constraint()

        if options and options.show_translation:
            sentence.translate()

        if options and options.show_word_detail:
            if options.show_translation:
                print "  ---"
            sentence.dump()

        print

    if options.speech_mode:
        speak_latin.pause_while_speaking()
Example #2
0
def analyse_text(text, options=None):
    # テキストを(句点などで)センテンスに切り分ける
    for word_surfaces_in_a_sentence in textutil.sentence_stream(textutil.word_stream_from_text(text)):
        plain_text = ' '.join(word_surfaces_in_a_sentence)
        if options.echo_on:
            # print plain_text + "\n"
            print "\n" + ansi_color.underline(ansi_color.bold( plain_text )) + "\n"
        if options.speech_mode:
            speak_latin.say_latin(plain_text.decode('utf-8'))

        # unicodeに変換して
        word_surfaces_uc = [word_surface.decode('utf-8', 'strict') for word_surface in word_surfaces_in_a_sentence]
        # 辞書を引いてから
        words = lookup_all(word_surfaces_uc)

        # 先にlookup結果を表示してしまう
        if options and options.show_word_detail:
            print "  --- "
            maxlen_uc = max([0] + [word.surface_len for word in words])
            for i, word in enumerate(words):
                text = word.surface.encode('utf-8')
                print '  %2d  ' % (i) + text + ' '*(maxlen_uc - word.surface_len + 1), word.detail()
            print "  --- "
            print

        # 形容詞/属格の対応
        words, visited_ix = detect_and_or(words)
        words, adj_ix = detect_adj_correspondances(words)
        words, gen_ix = detect_genitive_correspondances(words)
        for ix in visited_ix:
            words[ix] = None
        for ix in adj_ix:
            words[ix] = None
        for ix in gen_ix:
            words[ix] = None
        words = filter(identity, words)

        words, verb_ix = detect_verbs(words)

        words = detect_prep_domination(words)
        words = filter(identity, words)
#        print [word.surface_utf8() for word in words]

        print

        # 名詞句を述語動詞に結びつける
        verbs_ix = []
        verb_count = 0
        for i, word in enumerate(words):
            if isinstance(word, Predicate):
                verbs_ix.append(i)
                verb_count += 1

        verb_surfaces = ', '.join([ansi_color.bold(words[ix].surface_utf8()) for ix in verbs_ix])
        M = len(words)
        groups = []
        if verb_count == 0:
            print ansi_color.underline("NO VERB FOUND.")
            groups.append( range(M) )
        elif verb_count == 1:
            print ansi_color.underline("1 VERB FOUND:") + ' ' + verb_surfaces
            groups.append( range(M) )
        else:
            print ansi_color.underline("%d VERBS FOUND:" % verb_count) + ' ' + verb_surfaces
            groups.append( range(verbs_ix[0]+1) ) # [0..ix0]
            for i in range(1, verb_count-1):
                groups.append( [verbs_ix[i]] )
            groups.append( range(verbs_ix[verb_count-1], M) )
            for i in range(verb_count-1):
                fr = groups[i][-1] + 1
                to = groups[i+1][0] - 1
                if fr == to: continue

                well_divided_at = None
                for j in range(fr, to+1):
                    if words[j].surface == u',':
                        well_divided_at = j
                        break
                if well_divided_at is None:
                    for j in range(fr, to+1):
                        if words[j].surface == u'quod':
                            well_divided_at = j-1
                            break
                if well_divided_at is None:
                    for j in range(fr, to+1):
                        if words[j].surface == u'et':
                            well_divided_at = j-1
                            break
                if well_divided_at is not None:
                    groups[i] += range(fr, well_divided_at+1)
                    groups[i+1] = range(well_divided_at+1, to+1) + groups[i+1]
                else:
                    print "  NOT WELL: {%d..%d}" % (fr, to)
                    # うまく分けられない。とりあえず後の方に入れる
                    groups[i+1] = range(fr, to+1) + groups[i+1]

        print
        for i, group in enumerate(groups):
            if verb_count == 0:
                ws = []
                for word in words:
                    if isinstance(word, Word) and not word.items: continue
                    # ws.append(word)
                    dump(word)
                    print "  → ", translate(word)
                    print # "  --"
                # dump(ws)
                # print translate(ws)
            else:
                not_solved = []
                # words_in_group = [words[ix] for ix in group]
                verb_ix = verbs_ix[i]
                pred = words[verb_ix] # predicate
                for j, ix in enumerate(group):
                    if ix == verb_ix: continue
                    word = words[ix]
                    if isinstance(word, AndOr):
                        pred.add_nominal(word.cases[0], word)
                    elif isinstance(word, PrepClause):
                        pred.add_nominal(word.prep, word)
                    elif isinstance(word, Word):
                        if not word.items: continue
                        first_item = word.items[0]
                        if j == 0 and word.surface in (u'quod', u'ut'):
                            if word.items[1].pos == 'conj':
                                first_item = word.items[1]
                                word.items = word.items[1:]
                        if first_item.pos == 'conj':
                            if j < 2 and not pred.conjunction:
                                pred.conjunction = word
                            else:
                                not_solved.append(word)
                        elif first_item.pos == 'adv':
                            if j < 2 and not pred.conjunction:
                                pred.conjunction = word
                            elif word.surface in (u'ō', u'Ō'):
                                # 二重になってないかチェックする or conjunction を複数取る
                                pred.conjunction = word
                            else:
                                pred.add_modifier(word)
                        elif first_item._:
                            cases = [x[0] for x in first_item._]
                            case = None
                            if 'Voc' in cases and ix > 0 and words[ix-1].surface in (u'ō', u'Ō'):
                                case = 'Voc'
                                # 形的にVocしかありえないケースも拾いたい
                            else:
                                for x in first_item._:
                                    if x[0] == 'Nom':
                                        if x[2] == 'n':
                                            case = 'Nom/Acc'
                                        else:
                                            case = x[0]
                                        break
                                    elif x[0] == 'Acc':
                                        case = x[0]
                                        break
                                    else:
                                        if not case:
                                            case = x[0]

                            # if not case: case = case_n
                            pred.add_nominal(case, word)
                        else:
                            # print "not solved += ", word.surface_utf8()
                            not_solved.append(word) #(ix, word.surface_utf8()))

                if not_solved:
                    print "  NOT SOLVED:"
                    # dump(not_solved, initial_indent=2)
                    # print translate(not_solved)
                    for item in not_solved:
                        dump(item, 4)
                        print "    → ", translate(item)
                        print

                dump(pred)
                print
                print "  → ", translate(pred)
                print

        # 音読モードの場合、読み終わるまでウェイトを入れる
        if options.speech_mode:
            speak_latin.pause_while_speaking()