def main():
    """CLI entry point.

    Parses command-line options, optionally initializes the speech
    synthesizer, loads the Latin dictionary, then dispatches to either
    REPL mode (no file arguments) or file mode (analyse each named file).
    """
    options = Options(sys.argv[1:])
    if options.speech_mode:
        speak_latin.init_synth('Alex')

    latindic.load(auto_macron_mode=options.auto_macron_mode)

    if not options.args:
        # repl mode
        if select.select([sys.stdin], [], [], 0.0)[0]:
            # have data from pipe. no prompt.
            repl(options=options)
        else:
            repl(options=options, show_prompt=True)
    else:
        # file mode
        # NOTE: renamed loop variable from `file` — it shadowed the
        # Python 2 builtin of the same name.
        for path in options.args:
            text = textutil.load_text_from_file(path)
            # `options` is always truthy here (just constructed above),
            # so only the flag itself needs testing.
            if options.capital_to_macron_mode:
                text = char.trans(text)
            analyse_text(text, options)
def main(): latindic.load(auto_macron_mode=False) show_title('original text') text = textutil.load_text_from_file('./latin.txt') print text[:1000], '...' print show_title('texts in base-form') texts_in_baseform = [] for word_surfaces_in_a_sentence in textutil.sentence_stream(textutil.word_stream_from_text(text)): # print word_surfaces_in_a_sentence bases = base_forms_of_words(word_surfaces_in_a_sentence) texts_in_baseform.append(bases) for sentence in texts_in_baseform[:20]: print ' '.join([baseform.encode('utf-8') for baseform in sentence]) print '...' print show_title('[gensim] dictionary') dictionary = corpora.Dictionary(texts_in_baseform) # dictionary.save('/tmp/latintext.dict') # store the dictionary, for future reference # print dictionary print '{', for token, id in dictionary.token2id.items(): print '\"%s\": %d,' % (token.encode('utf-8'), id), print '}' # new_doc = "In Crētā īnsulā māgnum labyrinthum Daedalus aedificāvit plēnum viārum flexuōsārum." # new_bases = base_forms_of_words(new_doc.split()) # # print new_bases # new_vec = dictionary.doc2bow(new_bases) # print new_vec show_title('[gensim] corpus') corpus = [dictionary.doc2bow(text) for text in texts_in_baseform] # corpora.MmCorpus.serialize('/tmp/latintext.mm', corpus) # print corpus for doc in corpus[:20]: print doc print '...' print show_title('tf-idf') # term frequency * inverse document frequency tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model corpus_tfidf = tfidf[corpus] for i, doc in enumerate(corpus_tfidf): print doc if i == 20: break print '...' print # def decode_result(item, delim): def translate(token): # print "translating \"%s\"..." 
% token.encode('utf-8') items = latindic.lookup(token) return items[0]['ja'] if items else '*' latin_tokens = re.split(delim, item)[1::2] jas = [translate(token) for token in latin_tokens] return ' / '.join(jas) # print "\t", items[0]['ja'] NUM_TOPICS = 80 TOPICS_TO_TAKE = 10 show_title('LSI (Latent Semantic Indexing)') # initialize an LSI transformation lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=NUM_TOPICS) # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi corpus_lsi = lsi[corpus_tfidf] topics = lsi.print_topics(TOPICS_TO_TAKE) for i, item in enumerate(topics): print "%d) %s" % (1+i, item.encode('utf-8')) print " ", decode_result(item, '"') print print show_title('LDA (Latent Dirichlet Allocation)') model = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=NUM_TOPICS) topics = model.show_topics(topics=TOPICS_TO_TAKE) for i, item in enumerate(topics): print "%d) %s" % (1+i, item.encode('utf-8')) print " ", decode_result(item, ' ?[*+]') print print