Example #1
0
# corpus_name     = 'head500.noblanks.cor'
# Input and output locations; everything is derived from base_path.
working_corpus = base_path + corpus_path + corpus_name
human_data_file = base_path + "corpora/lee/lee-doc2doc/similarities0-1.txt"
lee_corpus = base_path + "corpora/lee/lee.cor"
result_path = base_path + "results/"

# Load the id<->word mappings that belong to the working corpus and wrap
# them in a Dictionary so documents can be converted with doc2bow() below.
logging.info('loading word mapping')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

# Bag-of-words version of the working corpus, read from '<corpus>_bow.mm'.
logging.info('loading corpus')
corpus_bow = MmCorpus(working_corpus + '_bow.mm')

# NOTE: despite its name, `tfidf` holds a LogEntropyModel, not a tf-idf
# model.  The name is kept because code further down may refer to it.
logging.info("create log_ent model and save it to disk")
tfidf = LogEntropyModel(corpus_bow,
                        id2word=dictionary.id2token,
                        normalize=True)
tfidf.save(result_path + corpus_name + log_ent_extension)

# Read the Lee corpus, preprocess it and convert every document to
# bag-of-words using the dictionary loaded above.
# NOTE(review): allowUpdate=False presumably keeps the dictionary fixed, so
# words unknown to it are dropped -- confirm against Dictionary.doc2bow.
logging.info('load smal lee corpus and preprocess')
raw_lee_texts = utils.get_txt(lee_corpus)
preproc_lee_texts = preprocessing.preprocess_documents(raw_lee_texts)
bow_lee_texts = [
    dictionary.doc2bow(text, allowUpdate=False, returnMissingWords=False)
    for text in preproc_lee_texts
]

# Train LSI on the log-entropy weighted corpus and save it to disk.
# NOTE(review): this passes the raw `id2word` mapping while the log-entropy
# model above was given `dictionary.id2token`; verify both are the same map.
logging.info('initialize LSI model')
lsi = models.LsiModel(tfidf[corpus_bow], id2word=id2word, numTopics=num_topics)
# Format only the '_%i_ent' fragment: applying '%' to the whole concatenated
# path would fail (or misformat) as soon as result_path, corpus_name or
# lsi_extension contained a literal '%'.
lsi.save(result_path + corpus_name + ('_%i_ent' % num_topics) + lsi_extension)
logging.info('transforming small lee corpus (LSI)')