Code example #1
File: latin1.py  Project: naoyat/latin
def repl(options=None, show_prompt=False):
    while True:
        if show_prompt:
            sys.stdout.write("> ")
            sys.stdout.flush()

        line = sys.stdin.readline()
        if not line: break

        text = line.rstrip()
        if options and not options.strict_macron_mode:
            text = char.trans(text)

        # textutil.analyse_text(text, analyse_sentence)
        for sentence in textutil.sentence_stream(textutil.word_stream_from_text(text)):
            analyse_sentence(sentence, options=options)

    if show_prompt:
        print
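The `char`, `textutil`, and `analyse_sentence` names are project modules imported elsewhere in latin1.py. To run this REPL stand-alone, minimal stubs along the following lines suffice; the stub bodies are assumptions for illustration, not the project's actual implementations:

import sys

class char:
    @staticmethod
    def trans(text):
        # assumed: the real version transliterates macron notation
        return text

class textutil:
    @staticmethod
    def word_stream_from_text(text):
        return iter(text.split())

    @staticmethod
    def sentence_stream(words):
        # assumed: the real version splits at sentence-final punctuation
        yield list(words)

def analyse_sentence(sentence, options=None):
    print ' '.join(sentence)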
Code example #2
File: latin1.py  Project: naoyat/latin
def main():
    options = Options(sys.argv[1:])
    if options.speech_mode:
        speak_latin.init_synth('Alex')

    latindic.load(auto_macron_mode=options.auto_macron_mode)

    if len(options.args) == 0:
        # repl mode
        if select.select([sys.stdin], [], [], 0.0)[0]:
            # have data from pipe. no prompt.
            repl(options=options)
        else:
            repl(options=options, show_prompt=True)
    else:
        # file mode
        for file in options.args:
            text = textutil.load_text_from_file(file)
            if options.strict_macron_mode:
                text = char.trans(text)

            # textutil.analyse_text(text, analyse_sentence, options=options)
            for sentence in textutil.sentence_stream(textutil.word_stream_from_text(text)):
                analyse_sentence(sentence, options=options)
Code example #3
File: latin.py  Project: naoyat/latin
def analyse_text(text, options=None):
    # split the text into sentences (at full stops and the like)
    for word_surfaces_in_a_sentence in textutil.sentence_stream(textutil.word_stream_from_text(text)):
        plain_text = ' '.join(word_surfaces_in_a_sentence)
        if options and options.echo_on:
            # print plain_text + "\n"
            print "\n" + ansi_color.underline(ansi_color.bold( plain_text )) + "\n"
        if options and options.speech_mode:
            speak_latin.say_latin(plain_text.decode('utf-8'))

        # convert each surface to unicode
        word_surfaces_uc = [word_surface.decode('utf-8', 'strict') for word_surface in word_surfaces_in_a_sentence]
        # then look the words up in the dictionary
        words = lookup_all(word_surfaces_uc)

        # print the lookup results up front
        if options and options.show_word_detail:
            print "  --- "
            maxlen_uc = max([0] + [word.surface_len for word in words])
            for i, word in enumerate(words):
                text = word.surface.encode('utf-8')
                print '  %2d  ' % (i) + text + ' '*(maxlen_uc - word.surface_len + 1), word.detail()
            print "  --- "
            print

        # adjective / genitive correspondence
        words, visited_ix = detect_and_or(words)
        words, adj_ix = detect_adj_correspondances(words)
        words, gen_ix = detect_genitive_correspondances(words)
        for ix in visited_ix:
            words[ix] = None
        for ix in adj_ix:
            words[ix] = None
        for ix in gen_ix:
            words[ix] = None
        words = filter(identity, words)

        words, verb_ix = detect_verbs(words)

        words = detect_prep_domination(words)
        words = filter(identity, words)
#        print [word.surface_utf8() for word in words]

        print

        # tie noun phrases to the predicate verb
        verbs_ix = []
        verb_count = 0
        for i, word in enumerate(words):
            if isinstance(word, Predicate):
                verbs_ix.append(i)
                verb_count += 1

        verb_surfaces = ', '.join([ansi_color.bold(words[ix].surface_utf8()) for ix in verbs_ix])
        M = len(words)
        groups = []
        if verb_count == 0:
            print ansi_color.underline("NO VERB FOUND.")
            groups.append( range(M) )
        elif verb_count == 1:
            print ansi_color.underline("1 VERB FOUND:") + ' ' + verb_surfaces
            groups.append( range(M) )
        else:
            print ansi_color.underline("%d VERBS FOUND:" % verb_count) + ' ' + verb_surfaces
            groups.append( range(verbs_ix[0]+1) ) # [0..ix0]
            for i in range(1, verb_count-1):
                groups.append( [verbs_ix[i]] )
            groups.append( range(verbs_ix[verb_count-1], M) )
            for i in range(verb_count-1):
                fr = groups[i][-1] + 1
                to = groups[i+1][0] - 1
                if fr == to: continue

                well_divided_at = None
                for j in range(fr, to+1):
                    if words[j].surface == u',':
                        well_divided_at = j
                        break
                if well_divided_at is None:
                    for j in range(fr, to+1):
                        if words[j].surface == u'quod':
                            well_divided_at = j-1
                            break
                if well_divided_at is None:
                    for j in range(fr, to+1):
                        if words[j].surface == u'et':
                            well_divided_at = j-1
                            break
                if well_divided_at is not None:
                    groups[i] += range(fr, well_divided_at+1)
                    groups[i+1] = range(well_divided_at+1, to+1) + groups[i+1]
                else:
                    print "  NOT WELL: {%d..%d}" % (fr, to)
                    # could not split cleanly; put the span into the later group for now
                    groups[i+1] = range(fr, to+1) + groups[i+1]

        print
        for i, group in enumerate(groups):
            if verb_count == 0:
                ws = []
                for word in words:
                    if isinstance(word, Word) and not word.items: continue
                    # ws.append(word)
                    dump(word)
                    print "  → ", translate(word)
                    print # "  --"
                # dump(ws)
                # print translate(ws)
            else:
                not_solved = []
                # words_in_group = [words[ix] for ix in group]
                verb_ix = verbs_ix[i]
                pred = words[verb_ix] # predicate
                for j, ix in enumerate(group):
                    if ix == verb_ix: continue
                    word = words[ix]
                    if isinstance(word, AndOr):
                        pred.add_nominal(word.cases[0], word)
                    elif isinstance(word, PrepClause):
                        pred.add_nominal(word.prep, word)
                    elif isinstance(word, Word):
                        if not word.items: continue
                        first_item = word.items[0]
                        if j == 0 and word.surface in (u'quod', u'ut'):
                            if word.items[1].pos == 'conj':
                                first_item = word.items[1]
                                word.items = word.items[1:]
                        if first_item.pos == 'conj':
                            if j < 2 and not pred.conjunction:
                                pred.conjunction = word
                            else:
                                not_solved.append(word)
                        elif first_item.pos == 'adv':
                            if j < 2 and not pred.conjunction:
                                pred.conjunction = word
                            elif word.surface in (u'ō', u'Ō'):
                                # check this is not doubled, or allow multiple conjunctions
                                pred.conjunction = word
                            else:
                                pred.add_modifier(word)
                        elif first_item._:
                            cases = [x[0] for x in first_item._]
                            case = None
                            if 'Voc' in cases and ix > 0 and words[ix-1].surface in (u'ō', u'Ō'):
                                case = 'Voc'
                                # would also like to catch cases where only Voc is morphologically possible
                            else:
                                for x in first_item._:
                                    if x[0] == 'Nom':
                                        if x[2] == 'n':
                                            case = 'Nom/Acc'
                                        else:
                                            case = x[0]
                                        break
                                    elif x[0] == 'Acc':
                                        case = x[0]
                                        break
                                    else:
                                        if not case:
                                            case = x[0]

                            # if not case: case = case_n
                            pred.add_nominal(case, word)
                        else:
                            # print "not solved += ", word.surface_utf8()
                            not_solved.append(word) #(ix, word.surface_utf8()))

                if not_solved:
                    print "  NOT SOLVED:"
                    # dump(not_solved, initial_indent=2)
                    # print translate(not_solved)
                    for item in not_solved:
                        dump(item, 4)
                        print "    → ", translate(item)
                        print

                dump(pred)
                print
                print "  → ", translate(pred)
                print

        # in speech mode, wait until the utterance has finished
        if options and options.speech_mode:
            speak_latin.pause_while_speaking()
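When more than one verb is found, the span of words between two consecutive verb groups is split at a comma if one exists, otherwise just before `quod`, otherwise just before `et`. A condensed sketch of just that search, operating on plain surface strings (an illustrative simplification of the loop above):

def split_point(surfaces, fr, to):
    # Prefer a comma as the boundary.
    for j in range(fr, to + 1):
        if surfaces[j] == u',':
            return j
    # Otherwise split just before a subordinating 'quod', then before 'et'.
    for marker in (u'quod', u'et'):
        for j in range(fr, to + 1):
            if surfaces[j] == marker:
                return j - 1
    return None  # caller attaches the whole span to the following group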
Code example #4
File: lda_demo.py  Project: naoyat/latin
def main():
    latindic.load(auto_macron_mode=False)


    show_title('original text')

    text = textutil.load_text_from_file('./latin.txt')
    print text[:1000], '...'
    print


    show_title('texts in base-form')

    texts_in_baseform = []
    for word_surfaces_in_a_sentence in textutil.sentence_stream(textutil.word_stream_from_text(text)):
        # print word_surfaces_in_a_sentence

        bases = base_forms_of_words(word_surfaces_in_a_sentence)
        texts_in_baseform.append(bases)

    for sentence in texts_in_baseform[:20]:
        print ' '.join([baseform.encode('utf-8') for baseform in sentence])
    print '...'
    print


    show_title('[gensim] dictionary')

    dictionary = corpora.Dictionary(texts_in_baseform)
    # dictionary.save('/tmp/latintext.dict') # store the dictionary, for future reference
    # print dictionary
    print '{',
    for token, id in dictionary.token2id.items():
        print '\"%s\": %d,' % (token.encode('utf-8'), id),
    print '}'

#    new_doc = "In Crētā īnsulā māgnum labyrinthum Daedalus aedificāvit plēnum viārum flexuōsārum."
#    new_bases = base_forms_of_words(new_doc.split())
#    # print new_bases
#    new_vec = dictionary.doc2bow(new_bases)
#    print new_vec



    show_title('[gensim] corpus')

    corpus = [dictionary.doc2bow(text) for text in texts_in_baseform]
    # corpora.MmCorpus.serialize('/tmp/latintext.mm', corpus)
    # print corpus
    for doc in corpus[:20]:
        print doc
    print '...'
    print



    show_title('tf-idf')  # term frequency * inverse document frequency

    tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
    corpus_tfidf = tfidf[corpus]
    for i, doc in enumerate(corpus_tfidf):
        print doc
        if i == 19: break  # preview the first 20 documents, matching the slices above
    print '...'
    print


    #
    def decode_result(item, delim):
        def translate(token):
            # print "translating \"%s\"..." % token.encode('utf-8')
            items = latindic.lookup(token)
            return items[0]['ja'] if items else '*'
        latin_tokens = re.split(delim, item)[1::2]
        jas = [translate(token) for token in latin_tokens]
        return ' / '.join(jas) # print "\t", items[0]['ja']


    NUM_TOPICS = 80
    TOPICS_TO_TAKE = 10


    show_title('LSI (Latent Semantic Indexing)')

    # initialize an LSI transformation
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=NUM_TOPICS)
    # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
    corpus_lsi = lsi[corpus_tfidf]
    topics = lsi.print_topics(TOPICS_TO_TAKE)

    for i, item in enumerate(topics):
        print "%d) %s" % (1+i, item.encode('utf-8'))
        print "    ", decode_result(item, '"')
        print

    print



    show_title('LDA (Latent Dirichlet Allocation)')

    model = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=NUM_TOPICS)
    topics = model.show_topics(topics=TOPICS_TO_TAKE)
    for i, item in enumerate(topics):
        print "%d) %s" % (1+i, item.encode('utf-8'))
        print "    ", decode_result(item, ' ?[*+]')
        print

    print
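A note on portability: this demo targets the older, Python 2 era gensim API. In current gensim releases, `LdaModel.show_topics` takes `num_topics=` rather than `topics=`, and the `print` statements would need to become function calls under Python 3. The bag-of-words conversion itself is unchanged; a minimal illustration with toy token lists (invented for illustration, not from the project's corpus):

from gensim import corpora

texts = [[u'puella', u'rosam', u'amat'],
         [u'puer', u'puellam', u'videt']]
dictionary = corpora.Dictionary(texts)
# doc2bow counts occurrences of known tokens and drops unknown ones.
print dictionary.doc2bow([u'puella', u'amat', u'amat', u'ignotum'])
# -> e.g. [(0, 1), (2, 2)]  (ids depend on insertion order)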