Ejemplo n.º 1
0
def main():
    """Entry point: parse command-line options, load the Latin dictionary,
    then run either as a REPL (no file arguments) or in batch mode over
    each file named on the command line.
    """
    options = Options(sys.argv[1:])
    if options.speech_mode:
        # initialize the speech synthesizer with the 'Alex' voice
        speak_latin.init_synth('Alex')

    latindic.load(auto_macron_mode=options.auto_macron_mode)

    if not options.args:
        # repl mode
        if select.select([sys.stdin,],[],[],0.0)[0]:
            # stdin already has piped data waiting: read it, no prompt.
            repl(options=options)
        else:
            repl(options=options, show_prompt=True)
    else:
        # file mode: analyse each file given on the command line.
        # 'path' avoids shadowing the builtin 'file'.
        for path in options.args:
            text = textutil.load_text_from_file(path)
            # 'options' is always truthy here; test only the flag itself.
            if options.capital_to_macron_mode:
                text = char.trans(text)

            analyse_text(text, options)
Ejemplo n.º 2
0
def main():
    """Demo pipeline (Python 2): load a Latin text, reduce each sentence to
    base-form tokens, then build a gensim dictionary/bag-of-words corpus and
    run tf-idf, LSI and LDA over it, printing intermediate results at every
    stage. Reads './latin.txt' from the current directory.
    """
    # load the Latin dictionary without automatic macron restoration
    latindic.load(auto_macron_mode=False)


    show_title('original text')

    text = textutil.load_text_from_file('./latin.txt')
    # show only a preview of the raw text
    print text[:1000], '...'
    print


    show_title('texts in base-form')

    # one entry per sentence: a list of base-form (lemma) tokens
    texts_in_baseform = []
    for word_surfaces_in_a_sentence in textutil.sentence_stream(textutil.word_stream_from_text(text)):
        # print word_surfaces_in_a_sentence

        bases = base_forms_of_words(word_surfaces_in_a_sentence)
        texts_in_baseform.append(bases)

    # preview the first 20 lemmatized sentences
    for sentence in texts_in_baseform[:20]:
        print ' '.join([baseform.encode('utf-8') for baseform in sentence])
    print '...'
    print


    show_title('[gensim] dictionary')

    # token -> integer id mapping over all base-form sentences
    dictionary = corpora.Dictionary(texts_in_baseform)
    # dictionary.save('/tmp/latintext.dict') # store the dictionary, for future reference
    # print dictionary
    # dump the whole mapping as a {"token": id, ...} style listing
    # NOTE(review): the loop variable 'id' shadows the builtin id()
    print '{',
    for token, id in dictionary.token2id.items():
        print '\"%s\": %d,' % (token.encode('utf-8'), id),
    print '}'

#    new_doc = "In Crētā īnsulā māgnum labyrinthum Daedalus aedificāvit plēnum viārum flexuōsārum."
#    new_bases = base_forms_of_words(new_doc.split())
#    # print new_bases
#    new_vec = dictionary.doc2bow(new_bases)
#    print new_vec



    show_title('[gensim] corpus')

    # bag-of-words vector (list of (token_id, count)) per sentence
    corpus = [dictionary.doc2bow(text) for text in texts_in_baseform]
    # corpora.MmCorpus.serialize('/tmp/latintext.mm', corpus)
    # print corpus
    for doc in corpus[:20]:
        print doc
    print '...'
    print



    show_title('tf-idf')  # term frequency * inverse document frequency

    tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
    # step 2 -- apply the model to the whole corpus (lazy wrapper)
    corpus_tfidf = tfidf[corpus]
    for i, doc in enumerate(corpus_tfidf):
        print doc
        # NOTE(review): breaks after printing index 20, i.e. 21 docs —
        # inconsistent with the [:20] slices above; possibly intended i == 19
        if i == 20: break
    print '...'
    print


    #
    def decode_result(item, delim):
        """Extract the Latin tokens embedded in a gensim topic string and
        return their Japanese glosses (via latindic) joined with ' / '.
        Tokens are assumed to sit at the odd positions after re.split(delim).
        """
        def translate(token):
            # print "translating \"%s\"..." % token.encode('utf-8')
            items = latindic.lookup(token)
            # '*' marks tokens with no dictionary entry
            return items[0]['ja'] if items else '*'
        # [1::2] keeps the delimited (quoted/matched) substrings only
        latin_tokens = re.split(delim, item)[1::2]
        jas = [translate(token) for token in latin_tokens]
        return ' / '.join(jas) # print "\t", items[0]['ja']


    NUM_TOPICS = 80        # number of latent topics to fit
    TOPICS_TO_TAKE = 10    # how many topics to display


    show_title('LSI (Latent Semantic Indexing)')

    # initialize an LSI transformation
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=NUM_TOPICS)
    # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
    corpus_lsi = lsi[corpus_tfidf]
    topics = lsi.print_topics(TOPICS_TO_TAKE)

    # LSI topic strings quote tokens with '"', hence delim='"' below
    for i, item in enumerate(topics):
        print "%d) %s" % (1+i, item.encode('utf-8'))
        print "    ", decode_result(item, '"')
        print

    print



    show_title('LDA (Latent Dirichlet Allocation)')

    # NOTE(review): LDA is fit on raw bag-of-words counts (not tf-idf), and
    # show_topics(topics=...) is an old gensim keyword — verify against the
    # installed gensim version
    model = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=NUM_TOPICS)
    topics = model.show_topics(topics=TOPICS_TO_TAKE)
    # LDA topic strings separate terms with '*' / '+', hence the regex delim
    for i, item in enumerate(topics):
        print "%d) %s" % (1+i, item.encode('utf-8'))
        print "    ", decode_result(item, ' ?[*+]')
        print

    print