Example #1
norm_model = 'stemmedAllCleaned-fq10-cd10.noblanks.cor_log_ent.model'
# the lsi transformation
trans_model = 'stemmedAllCleaned-fq10-cd10.noblanks.cor_500__lsi.model'

matrices = {}

logging.info('load the articles pickle')
with open(results_path + "sparql_wiki.pickle", 'rb') as f:
    articles = pickle.load(f)

logging.info('load the dictionary')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('load the log_ent model')
log_ent = LogEntropyModel.load(results_path + norm_model)

logging.info('load the LSI model')
lsi = LsiModel.load(results_path + trans_model)

for key in articles:

    logging.info('current term: %s' % key)

    term_list = articles[key].keys()
    text_list = [
        dictionary.doc2bow(article['text'],
                           allowUpdate=False,
                           returnMissingWords=False)
        for article in articles[key].values()
    ]
    sim_matrix = np.zeros((len(text_list), len(text_list)))

    logging.info('transform the textlist')
    text_list = lsi[log_ent[text_list]]
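The example is truncated here; a minimal sketch of how the pre-allocated sim_matrix might be filled, continuing inside the for-loop and assuming 500 LSI topics (matching the '_500__lsi' model name) and plain cosine similarity between the densified LSI vectors:

    # hedged sketch, not part of the original snippet: densify the LSI
    # vectors and take pairwise cosine similarities
    num_topics = 500  # assumption based on the model filename above
    dense = np.zeros((len(term_list), num_topics))
    for doc_idx, doc in enumerate(text_list):
        for topic_id, weight in doc:
            dense[doc_idx, topic_id] = weight
    norms = np.linalg.norm(dense, axis=1)
    norms[norms == 0.0] = 1.0  # guard against empty documents
    sim_matrix = np.dot(dense, dense.T) / np.outer(norms, norms)
    matrices[key] = sim_matrix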
Example #2
# corpus_name     = 'head500.noblanks.cor'
working_corpus = base_path + corpus_path + corpus_name
human_data_file = base_path + "corpora/lee/lee-doc2doc/similarities0-1.txt"
lee_corpus = base_path + "corpora/lee/lee.cor"
result_path = base_path + "results/"

logging.info('loading word mapping')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('loading corpus')
corpus_bow = MmCorpus(working_corpus + '_bow.mm')

logging.info("create log_ent model and save it to disk")
tfidf = LogEntropyModel(corpus_bow,
                        id2word=dictionary.id2token,
                        normalize=True)
tfidf.save(result_path + corpus_name + log_ent_extension)

logging.info('load small lee corpus and preprocess')
raw_lee_texts = utils.get_txt(lee_corpus)
preproc_lee_texts = preprocessing.preprocess_documents(raw_lee_texts)
bow_lee_texts = [
    dictionary.doc2bow(text, allowUpdate=False, returnMissingWords=False)
    for text in preproc_lee_texts
]

logging.info('initialize LSI model')
lsi = models.LsiModel(tfidf[corpus_bow], id2word=id2word, numTopics=num_topics)
lsi.save((result_path + corpus_name + '_%i_ent' + lsi_extension) % num_topics)
logging.info('transforming small lee corpus (LSI)')
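The listing stops at this log line; the transformation it announces (log-entropy weighting followed by the LSI projection of the preprocessed Lee documents) is the single line shown at the end of the near-identical Example #3 below:

corpus_lsi = lsi[tfidf[bow_lee_texts]]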
Example #3
# corpus_name     = 'head500.noblanks.cor'
working_corpus  = base_path + corpus_path + corpus_name
human_data_file = base_path + "corpora/lee/lee-doc2doc/similarities0-1.txt"
lee_corpus      = base_path + "corpora/lee/lee.cor"
result_path     = base_path + "results/"


logging.info('loading word mapping')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('loading corpus')
corpus_bow = MmCorpus(working_corpus + '_bow.mm')

logging.info("create log_ent model and save it to disk")
tfidf = LogEntropyModel(corpus_bow, id2word=dictionary.id2token, normalize=True)
tfidf.save(result_path + corpus_name + log_ent_extension)

logging.info('load small lee corpus and preprocess')
raw_lee_texts = utils.get_txt(lee_corpus)
preproc_lee_texts = preprocessing.preprocess_documents(raw_lee_texts)
bow_lee_texts = [dictionary.doc2bow(text,
                                    allowUpdate=False,
                                    returnMissingWords=False)
                for text in preproc_lee_texts]

logging.info('initialize LSI model')
lsi = models.LsiModel(tfidf[corpus_bow], id2word=id2word, numTopics=num_topics)
lsi.save((result_path + corpus_name + '_%i_ent' + lsi_extension) % num_topics)
logging.info('transforming small lee corpus (LSI)')
corpus_lsi = lsi[tfidf[bow_lee_texts]]
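The human_data_file set up at the top is never touched in this snippet. What follows is a hedged sketch of the evaluation it points to, correlating the model's pairwise document similarities with the human ratings; the file layout (a whitespace-separated matrix of 0-1 scores in the same document order as lee.cor) and the choice of Pearson correlation are assumptions, not part of the original code:

import numpy as np

# hedged sketch: densify the LSI vectors of the Lee documents
num_docs = len(bow_lee_texts)
dense_lsi = np.zeros((num_docs, num_topics))
for doc_idx, doc in enumerate(corpus_lsi):
    for topic_id, weight in doc:
        dense_lsi[doc_idx, topic_id] = weight

# pairwise cosine similarities between documents
norms = np.linalg.norm(dense_lsi, axis=1)
norms[norms == 0.0] = 1.0
model_sim = np.dot(dense_lsi, dense_lsi.T) / np.outer(norms, norms)

# assumed layout: whitespace-separated num_docs x num_docs matrix of 0-1 ratings
human_sim = np.loadtxt(human_data_file)

# Pearson correlation over the upper triangles (self-similarities excluded)
iu = np.triu_indices(num_docs, k=1)
correlation = np.corrcoef(model_sim[iu], human_sim[iu])[0, 1]
logging.info('correlation with human ratings: %s' % correlation)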
Example #4
norm_model = 'stemmedAllCleaned-fq10-cd10.noblanks.cor_log_ent.model'
# the lsi transformation
trans_model = 'stemmedAllCleaned-fq10-cd10.noblanks.cor_500__lsi.model'

matrices = {}

logging.info('load the articles pickle')
with open(results_path + "sparql_wiki.pickle", 'rb') as f:
    articles = pickle.load(f)

logging.info('load the dictionary')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('load the log_ent model')
log_ent = LogEntropyModel.load(results_path + norm_model)

logging.info('load the LSI model')
lsi = LsiModel.load(results_path + trans_model)

for key in articles:

    logging.info('current term: %s' % key)

    term_list = articles[key].keys()
    text_list = [
        dictionary.doc2bow(article['text'],
                           allowUpdate=False,
                           returnMissingWords=False)
        for article in articles[key].values()
    ]
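This listing stops right after text_list is built; in Example #1, which is the fuller variant of the same loop, the body continues with:

    sim_matrix = np.zeros((len(text_list), len(text_list)))

    logging.info('transform the textlist')
    text_list = lsi[log_ent[text_list]]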