# Pairwise LSI similarity of Wikipedia articles per term: load the trained
# log-entropy and LSI models and transform each term's article set.
import logging
import pickle

import numpy as np
from gensim.corpora.dictionary import Dictionary
from gensim.models import LogEntropyModel, LsiModel

# results_path, working_corpus, word_ids_extension and the project-local
# `utils` helper module are assumed to be defined in the project's config.

norm_model = 'stemmedAllCleaned-fq10-cd10.noblanks.cor_log_ent.model'
# the lsi transformation
trans_model = 'stemmedAllCleaned-fq10-cd10.noblanks.cor_500__lsi.model'
matrices = {}

logging.info('load the articles pickle')
with open(results_path + "sparql_wiki.pickle", 'rb') as f:
    articles = pickle.load(f)

logging.info('load the dictionary')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('load the log_ent model')
log_ent = LogEntropyModel.load(results_path + norm_model)

logging.info('load the LSI model')
lsi = LsiModel.load(results_path + trans_model)

for key in articles.iterkeys():
    logging.info('current term: %s' % key)
    term_list = articles[key].keys()
    text_list = [dictionary.doc2bow(article['text'], allowUpdate=False,
                                    returnMissingWords=False)
                 for article in articles[key].values()]
    sim_matrix = np.zeros((len(text_list), len(text_list)))
    logging.info('transform the textlist')
    text_list = lsi[log_ent[text_list]]
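    # --- hedged sketch, not part of the original script ---
    # The fragment stops after transforming text_list; sim_matrix is allocated
    # but never filled.  A plausible continuation of the loop body (an
    # assumption, using gensim's matutils.cossim; the import would normally
    # sit with the others at the top of the file) fills the matrix with
    # pairwise cosine similarities and stores it per term:
    from gensim import matutils
    transformed = list(text_list)  # materialize the transformed vectors
    for i, vec_i in enumerate(transformed):
        for j, vec_j in enumerate(transformed):
            sim_matrix[i, j] = matutils.cossim(vec_i, vec_j)
    matrices[key] = (term_list, sim_matrix)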
# Build a log-entropy weighting and LSI model on the working corpus, then
# transform the small Lee corpus into the LSI space.
import logging

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LogEntropyModel, LsiModel
from gensim.parsing import preprocessing

# base_path, corpus_path, word_ids_extension, log_ent_extension,
# lsi_extension, num_topics and the project-local `utils` module are
# assumed to be defined in the project's config.

# corpus_name = 'head500.noblanks.cor'
working_corpus = base_path + corpus_path + corpus_name
human_data_file = base_path + "corpora/lee/lee-doc2doc/similarities0-1.txt"
lee_corpus = base_path + "corpora/lee/lee.cor"
result_path = base_path + "results/"

logging.info('loading word mapping')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('loading corpus')
corpus_bow = MmCorpus(working_corpus + '_bow.mm')

logging.info('create log_ent model and save it to disk')
# note: the variable is named `tfidf` but holds a log-entropy model
tfidf = LogEntropyModel(corpus_bow, id2word=dictionary.id2token, normalize=True)
tfidf.save(result_path + corpus_name + log_ent_extension)

logging.info('load small lee corpus and preprocess')
raw_lee_texts = utils.get_txt(lee_corpus)
preproc_lee_texts = preprocessing.preprocess_documents(raw_lee_texts)
bow_lee_texts = [dictionary.doc2bow(text, allowUpdate=False,
                                    returnMissingWords=False)
                 for text in preproc_lee_texts]

logging.info('initialize LSI model')
lsi = LsiModel(tfidf[corpus_bow], id2word=id2word, numTopics=num_topics)
lsi.save((result_path + corpus_name + '_%i_ent' + lsi_extension) % num_topics)

logging.info('transforming small lee corpus (LSI)')
corpus_lsi = lsi[tfidf[bow_lee_texts]]
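# --- hedged sketch, not part of the original script ---
# human_data_file is loaded above but never used in this fragment.  One
# plausible follow-up (an assumption, including the file layout: a plain-text
# matrix of human 0-1 similarity ratings for the Lee document pairs) is to
# correlate the model's cosine similarities with the human judgments:
import numpy as np
from gensim import matutils

lsi_vectors = list(corpus_lsi)
n = len(lsi_vectors)
model_sim = np.array([[matutils.cossim(lsi_vectors[i], lsi_vectors[j])
                       for j in range(n)]
                      for i in range(n)])

human_sim = np.loadtxt(human_data_file)  # assumed whitespace-separated matrix
iu = np.triu_indices(n, k=1)             # compare upper triangles only
corr = np.corrcoef(model_sim[iu], human_sim[iu])[0, 1]
logging.info('Pearson correlation with human ratings: %.3f' % corr)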