def Pdf2Vec(titles):
    """Vectorize local PDF files into Log Entropy TF-IDF vectors.

    Each title is resolved to ../data/articleData/pdfs/<title>.pdf, parsed
    with slate, tokenized with gensim's wikicorpus tokenizer, converted to
    bag-of-words against the saved wiki dictionary, and transformed by the
    pre-trained LogEntropy model for querying a similarity index.

    Args:
        titles: iterable of PDF base names (no '.pdf' extension).

    Returns:
        [document-logent-vec-1, ..., document-logent-vec-N] where N is the
        number of titles.
    """
    # TODO: Make it so you can give a model as an argument to vectorize a
    # given document into any trained gensim model
    ret_lst = []
    logent = LogEntropyModel.load('../models/logEntropy.model')
    diction = Dictionary.load('../models/wiki_dict.dict')
    for title in titles:
        # PDFs are binary: open in 'rb' (text mode corrupts the byte stream
        # and fails with decode errors on Python 3). The 'with' block
        # guarantees the file handle is closed even if parsing raises.
        with open('../data/articleData/pdfs/' + title + '.pdf', 'rb') as curr_file:
            doc = ' '.join(slate.PDF(curr_file))
        doc_tokens = wikicorpus.tokenize(doc)
        bow = diction.doc2bow(doc_tokens)
        ret_lst.append(logent[bow])
    return ret_lst
print('Finished making the wikicorpus, saving BOW corpus\n')
corpora.mmcorpus.MmCorpus.serialize('../data/wiki_en_vocab200k', wiki_corpus)
print('Done saving BOW Corpus\n')

# Save the dictionary; you will need it to convert future documents into
# BOW format
#wiki.dictionary.save("../data/wiki_dict.dict")
#print 'Saved dictionary'

print('Creating LogEntropy TF-IDF and regular TF-IDF matrices and models')
BOW_corpus = MmCorpus('../data/wiki_en_vocab200k')  # Resurrect BOW corpus

#log_entropy = LogEntropyModel(BOW_corpus)
#log_entropy.save('../models/logEntropy.model')  # already provided
log_entropy = LogEntropyModel.load('../models/logEntropy.model')
corpora.mmcorpus.MmCorpus.serialize('../data/log_entropy_matrix',
                                    log_entropy[BOW_corpus])
print('Saved LogEntropy TF-IDF matrix')

#tfidf = TfidfModel(BOW_corpus)
#tfidf.save('../models/tfidf.model')  # already provided
tfidf = TfidfModel.load('../models/tfidf.model')
# BUG FIX: this previously serialized to '../data/log_entropy_matrix',
# overwriting the LogEntropy matrix written just above (so the load below
# would silently pick up TF-IDF data). The TF-IDF matrix gets its own file.
corpora.mmcorpus.MmCorpus.serialize('../data/tfidf_matrix', tfidf[BOW_corpus])
print('Saved TF-IDF matrix')

print('Creating Similarity Index')
logent_corpus = MmCorpus('../data/log_entropy_matrix')