def build_tfidf(self, corpuskb, dictionary):
    # Variable names are kept from an earlier TF-IDF version, but the model
    # actually built here is log-entropy.
    # tfidfkb = models.TfidfModel(corpuskb)  # build tf-idf model
    # note: newer gensim releases accept only (corpus, normalize) here
    tfidfkb = LogEntropyModel(corpuskb, id2word=dictionary)
    corpus_tfidf = tfidfkb[corpuskb]  # convert all texts into the weighted space
    tfidfkb.save(self.model_folder('tfidf_model'))
    return tfidfkb, corpus_tfidf
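# A minimal standalone sketch of what build_tfidf above does, using a toy
# corpus; the documents and query below are illustrative, not from the
# original project.
from gensim import corpora, similarities
from gensim.models import LogEntropyModel

docs = [['log', 'entropy', 'weighting'], ['tfidf', 'weighting'], ['entropy', 'model']]
dictionary = corpora.Dictionary(docs)
corpuskb = [dictionary.doc2bow(d) for d in docs]

model = LogEntropyModel(corpuskb)             # recent gensim: no id2word argument
corpus_weighted = model[corpuskb]             # weight every document in the corpus
index = similarities.MatrixSimilarity(corpus_weighted, num_features=len(dictionary))
print(index[model[dictionary.doc2bow(['entropy', 'model'])]])  # similarity of a query to each doc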
def logentropybuildspace(searchobject, morphdict, sentences):
    """
    currently unused

    :param searchobject:
    :param morphdict:
    :param sentences:
    :return:
    """
    sentences = [[w for w in words.lower().split() if w] for words in sentences if words]
    sentences = [s for s in sentences if s]
    bagsofwords = buildwordbags(searchobject, morphdict, sentences)
    logentropydictionary = corpora.Dictionary(bagsofwords)
    logentropycorpus = [logentropydictionary.doc2bow(bag) for bag in bagsofwords]
    logentropyxform = LogEntropyModel(logentropycorpus)
    lsixform = LsiModel(corpus=logentropycorpus, id2word=logentropydictionary,
                        onepass=False, num_topics=400)
    corpus = LogEntropyVectorCorpus(lsixform, logentropyxform, logentropydictionary,
                                    logentropycorpus, bagsofwords, sentences)
    return corpus
def create_doc_term_matrix(docs, id2word, tfidf=False, logentropy=False, random_projections=False):
    # Plain bag-of-words document-term matrix.
    doc_term_matrix = [id2word.doc2bow(doc) for doc in docs]
    _save_model2(doc_term_matrix, 'doc_term_matrix')

    if random_projections:
        rp_model = RpModel(corpus=doc_term_matrix, id2word=id2word, num_topics=params['num_topics'])
        doc_term_matrix = rp_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_random_projections')
    if tfidf:
        tfidf_model = TfidfModel(id2word=id2word, corpus=doc_term_matrix, normalize=True)
        doc_term_matrix = tfidf_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_tfidf')
    if logentropy:
        log_model = LogEntropyModel(corpus=doc_term_matrix, normalize=True)
        doc_term_matrix = log_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_logentropy')

    return doc_term_matrix
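# A usage sketch for create_doc_term_matrix above. `_save_model2` and `params`
# are helpers from the original project; the stubs below exist only to make
# the sketch self-contained and are not the original implementations.
from gensim.corpora import Dictionary
from gensim.models import LogEntropyModel

params = {'num_topics': 2}            # assumed shape of the project's config

def _save_model2(obj, name):          # stand-in for the project's persistence helper
    pass

docs = [['random', 'projection'], ['tfidf', 'weighting'], ['log', 'entropy']]
id2word = Dictionary(docs)
dtm = create_doc_term_matrix(docs, id2word, logentropy=True)  # log-entropy-weighted corpus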
def Pdf2Vec(titles):
    '''
    Vectorizes given PDFs on your local filesystem into Log Entropy TF-IDF
    vectors that can then be queried against your similarity index.

    Returns: [document-logent-vec-1, document-logent-vec-2, ..., document-logent-vec-N]
    where N is the number of titles.
    '''
    # TODO: accept a model as an argument so a given document can be
    # vectorized into any trained gensim model
    ret_lst = []
    logent = LogEntropyModel.load('../models/logEntropy.model')
    diction = Dictionary.load('../models/wiki_dict.dict')
    for title in titles:
        curr_file = open('../data/articleData/pdfs/' + title + '.pdf', 'rb')  # PDFs are binary
        doc = slate.PDF(curr_file)
        doc = ' '.join(doc)
        doc_tokens = wikicorpus.tokenize(doc)
        bow = diction.doc2bow(doc_tokens)
        ret_lst.append(logent[bow])
        curr_file.close()
    return ret_lst
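# A hypothetical query flow for Pdf2Vec above: the index file name and title
# are assumptions, not artifacts the original script is known to create.
from gensim import similarities

index = similarities.MatrixSimilarity.load('../models/wiki_index.index')  # assumed artifact
for title, vec in zip(['some_paper'], Pdf2Vec(['some_paper'])):
    sims = index[vec]               # cosine similarity against every indexed document
    print(title, sims.argmax())     # id of the most similar document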
def calc_similarity(ids, docs, kRandom=3, nClusters=3, sortCluster=True):
    # ids: list of IDs identifying texts
    # docs: list of tokenized docs
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # # TF-IDF alternative
    # tfidf = models.TfidfModel(corpus)
    # index = similarities.MatrixSimilarity(tfidf[corpus])
    # sims = index[tfidf[corpus]]
    # df_sims = pd.DataFrame(sims, index=ids, columns=ids)

    # Log-entropy weighting
    log_ent = LogEntropyModel(corpus)
    index = similarities.MatrixSimilarity(log_ent[corpus])
    sims = index[log_ent[corpus]]
    df_sims = pd.DataFrame(sims, index=ids, columns=ids)

    if sortCluster:
        # Order rows/columns by cluster ID, then by distance to the cluster center.
        X = df_sims.copy()
        model = KMeans(n_clusters=nClusters, random_state=kRandom)
        # pandas removed .as_matrix(); use .values instead
        clusassign = model.fit_predict(X.values)
        min_dist = np.min(cdist(X.values, model.cluster_centers_, 'euclidean'), axis=1)
        Y = pd.DataFrame(min_dist, index=X.index, columns=['Center_euclidean_dist'])
        Z = pd.DataFrame(clusassign, index=X.index, columns=['cluster_ID'])
        A = pd.concat([Y, Z], axis=1)
        A = A.sort_values(['cluster_ID', 'Center_euclidean_dist']).reset_index()
        namelist = A['index'].tolist()
        df_sim_sorted = pd.DataFrame(namelist, columns=['NameSort'])
        df_sim_sorted = pd.merge(df_sim_sorted, df_sims, left_on='NameSort',
                                 right_index=True).set_index('NameSort')
        df_sim_sorted = df_sim_sorted[namelist]
        return df_sim_sorted
    else:
        return df_sims
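# A toy run of calc_similarity above; documents must already be tokenized.
ids = ['a', 'b', 'c', 'd']
docs = [['apples', 'pears'], ['apples', 'oranges'], ['cars', 'roads'], ['cars', 'pears']]
df = calc_similarity(ids, docs, nClusters=2)  # rows/columns ordered by cluster
print(df.round(2))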
def create_corpus(dictionary, documents, is_tfidf=False, smartirs=None, is_log_entropy=False, is_normalize=True):
    # doc2bow expects one tokenized document at a time, so convert each
    # document separately rather than passing the whole list at once.
    corpus = [dictionary.doc2bow(d, allow_update=False) for d in documents]
    if is_tfidf:
        tfidf = TfidfModel(corpus=corpus, smartirs=smartirs)
        corpus = tfidf[corpus]
    elif is_log_entropy:
        log_entropy_model = LogEntropyModel(corpus, normalize=is_normalize)
        corpus = log_entropy_model[corpus]
    return corpus
def log_entropy(doc):
    doc = pre_process_w2v(doc)
    doc_token = [i.split() for i in doc]
    dct = Dictionary(doc_token)
    # doc2bow expects token lists, so build the corpus from doc_token,
    # not from the raw strings in doc.
    corpus = [dct.doc2bow(row) for row in doc_token]
    model = LogEntropyModel(corpus)
    vec = []
    for row in corpus:
        # transform each BOW vector directly; `row` is the vector itself,
        # not an index into the corpus
        vector = model[row]
        x = [t[1] for t in vector]
        vec.append(x)
    # zero-pad every row to the length of the longest vector
    length = len(sorted(vec, key=len, reverse=True)[0])
    vec = np.array([xi + [0] * (length - len(xi)) for xi in vec])
    return vec
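# A usage sketch for log_entropy above; pre_process_w2v is the project's own
# cleaning step, so these raw strings are assumed to pass through it cleanly.
docs = ['log entropy weighting', 'entropy of a term', 'weighting schemes vary']
matrix = log_entropy(docs)    # one zero-padded row of weights per document
print(matrix.shape)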
def create_corpus(dictionary, documents, is_tfidf=False, smartirs=None, is_log_entropy=False, is_normalize=True):
    corpus = [dictionary.doc2bow(d, allow_update=False) for d in documents]
    if is_tfidf:
        tfidf = TfidfModel(corpus=corpus, smartirs=smartirs)
        corpus = tfidf[corpus]
    elif is_log_entropy:
        log_entropy_model = LogEntropyModel(corpus, normalize=is_normalize)
        corpus = log_entropy_model[corpus]
    # Saving here would overwrite the existing file; re-save only when the
    # dictionary has been allowed to update.
    # dictionary.save(args.dictionary)
    return corpus
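# A usage sketch for create_corpus above, exercising both weighting branches;
# the toy documents and the 'ltc' SMART scheme are illustrative choices.
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LogEntropyModel

documents = [['log', 'entropy'], ['entropy', 'weighting'], ['tfidf', 'weighting']]
dictionary = Dictionary(documents)
tfidf_corpus = list(create_corpus(dictionary, documents, is_tfidf=True, smartirs='ltc'))
logent_corpus = list(create_corpus(dictionary, documents, is_log_entropy=True))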
def create_corpus(
    dictionary,
    documents,
    is_tfidf=False,
    smartirs=None,
    is_log_entropy=False,
    is_normalize=True,
):
    # dictionary = corpora.Dictionary.load(dictionary_path)
    # doc2bow expects one tokenized document at a time:
    corpus = [dictionary.doc2bow(d, allow_update=False) for d in documents]
    if is_tfidf:
        tfidf = TfidfModel(corpus=corpus, smartirs=smartirs)
        corpus = tfidf[corpus]
    elif is_log_entropy:
        log_entropy_model = LogEntropyModel(corpus, normalize=is_normalize)
        corpus = log_entropy_model[corpus]
    # dictionary.save(DICTIONARY_PATH)
    return corpus
wiki_corpus = WikiCorpus(articles)  # This will take many hours! Output is Wikipedia as a bag-of-words (BOW) sparse matrix.
wiki_corpus.dictionary.save("wiki_dict.dict")
MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)  # File will be several GBs.

### Working with persisted corpus and dictionary
bow_corpus = MmCorpus("wiki_corpus.mm")  # Revive a corpus
dictionary = Dictionary.load("wiki_dict.dict")  # Load a dictionary

### Transformations among vector spaces
from gensim.models import LsiModel, LogEntropyModel

# Log entropy weights frequencies of all document features in the corpus.
logent_transformation = LogEntropyModel(wiki_corpus, id2word=dictionary)

tokenize_func = wikicorpus.tokenize  # The tokenizer used to create the Wikipedia corpus
document = "Some text to be transformed."
# First, tokenize the document using the same tokenization as was used on the
# background corpus, then convert it to BOW representation using the dictionary
# created when generating the background corpus.
bow_document = dictionary.doc2bow(tokenize_func(document))
# Converts a single document to log-entropy representation; the document must
# be in the same vector space as the corpus.
logent_document = logent_transformation[[bow_document]]

documents = ["Some iterable", "containing multiple", "documents", "..."]
bow_documents = (dictionary.doc2bow(tokenize_func(document)) for document in documents)  # use a generator expression because...
logent_documents = logent_transformation[bow_documents]  # ...the transformation is applied lazily during iteration, so this uses constant memory

### Chained transformations
# MmCorpus cannot be built directly from a corpus object; serialize the
# log-entropy-transformed corpus and load it back instead. Will also take many
# hours with the Wikipedia corpus.
MmCorpus.serialize("logent_corpus.mm", logent_transformation[bow_corpus])
logent_corpus = MmCorpus("logent_corpus.mm")
# Creates an LSI transformation model from the log-entropy corpus
# representation. Takes several hours with the Wikipedia corpus.
lsi_transformation = LsiModel(corpus=logent_corpus, id2word=dictionary, num_topics=400)
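# A plausible continuation of the pipeline above, not part of the original
# snippet: index the LSI space and push a query through the same chain of
# transformations. The shard prefix "./wiki_index" and query text are assumptions.
from gensim import similarities

lsi_corpus = lsi_transformation[logent_transformation[bow_corpus]]
index = similarities.Similarity('./wiki_index', lsi_corpus, num_features=400)
query = lsi_transformation[logent_transformation[dictionary.doc2bow(tokenize_func("example query text"))]]
print(index[query][:10])  # similarity of the query to the first ten documents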
# This will take many hours! Output is Wikipedia as a bag-of-words (BOW) sparse matrix.
wiki_corpus = WikiCorpus(articles)
wiki_corpus.dictionary.save("wiki_dict.dict")
MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)  # File will be several GBs.

### Working with persisted corpus and dictionary
bow_corpus = MmCorpus("wiki_corpus.mm")  # Revive a corpus
dictionary = Dictionary.load("wiki_dict.dict")  # Load a dictionary

### Transformations among vector spaces
from gensim.models import LsiModel, LogEntropyModel

logent_transformation = LogEntropyModel(
    wiki_corpus, id2word=dictionary
)  # Log entropy weights frequencies of all document features in the corpus

tokenize_func = wikicorpus.tokenize  # The tokenizer used to create the Wikipedia corpus
document = "Some text to be transformed."
# First, tokenize the document using the same tokenization as was used on the background corpus, then convert it to
# BOW representation using the dictionary created when generating the background corpus.
bow_document = dictionary.doc2bow(tokenize_func(document))
# Converts a single document to log-entropy representation; the document must be in the same vector space as the corpus.
logent_document = logent_transformation[[bow_document]]

# Transform arbitrary documents by getting them into the same BOW vector space created by your training corpus
documents = ["Some iterable", "containing multiple", "documents", "..."]
bow_documents = (
    dictionary.doc2bow(tokenize_func(document)) for document in documents
)  # use a generator expression because...
logent_documents = logent_transformation[bow_documents]  # ...the transformation is applied lazily during iteration, so this uses constant memory
from gensim.corpora import WikiCorpus, wikicorpus, MmCorpus, Dictionary

articles = "enwiki-latest-pages-articles.xml.bz2"

wiki_corpus = WikiCorpus(articles)
wiki_corpus.dictionary.save("wiki_dict.dict")
MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)

bow_corpus = MmCorpus("wiki_corpus.mm")
dictionary = Dictionary.load("wiki_dict.dict")

from gensim.models import LsiModel, LogEntropyModel

logent_transformation = LogEntropyModel(wiki_corpus, id2word=dictionary)

tokenize_func = wikicorpus.tokenize
document = "Some text to be transformed."
bow_document = dictionary.doc2bow(tokenize_func(document))
logent_document = logent_transformation[[bow_document]]

documents = ["Some iterable", "containing multiple", "documents", "..."]
bow_documents = (dictionary.doc2bow(tokenize_func(document)) for document in documents)
logent_documents = logent_transformation[bow_documents]

# MmCorpus cannot be built directly from a corpus object; serialize the
# transformed corpus and load it back instead.
MmCorpus.serialize("logent_corpus.mm", logent_transformation[bow_corpus])
logent_corpus = MmCorpus("logent_corpus.mm")
def _new_model(self, X=None, y=None):
    return LogEntropyModel(X, normalize=self.normalize)
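# A sketch of the kind of estimator _new_model above plausibly belongs to: a
# minimal transformer holding a `normalize` flag. The class name and the
# fit()/transform() methods are assumptions, not the original project's API.
from gensim.models import LogEntropyModel

class LogEntropyTransformer:
    def __init__(self, normalize=True):
        self.normalize = normalize

    def _new_model(self, X=None, y=None):
        return LogEntropyModel(X, normalize=self.normalize)

    def fit(self, X, y=None):            # X: a gensim BOW corpus
        self.model_ = self._new_model(X, y)
        return self

    def transform(self, X):
        return self.model_[X]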
print('Finished making the wikicorpus, saving BOW corpus\n')
corpora.mmcorpus.MmCorpus.serialize('../data/wiki_en_vocab200k', wiki_corpus)
print('Done saving BOW Corpus\n')

# Save the dictionary; you will need it to convert future documents into
# BOW format
# wiki.dictionary.save("../data/wiki_dict.dict")
# print('Saved dictionary')

print('Creating LogEntropy TF-IDF and regular TF-IDF matrices and models')
BOW_corpus = MmCorpus('../data/wiki_en_vocab200k')  # Resurrect BOW corpus

# log_entropy = LogEntropyModel(BOW_corpus)
# log_entropy.save('../models/logEntropy.model')  # already provided
log_entropy = LogEntropyModel.load('../models/logEntropy.model')
corpora.mmcorpus.MmCorpus.serialize('../data/log_entropy_matrix',
                                    log_entropy[BOW_corpus])
print('Saved LogEntropy TF-IDF matrix')

# tfidf = TfidfModel(BOW_corpus)
# tfidf.save('../models/tfidf.model')  # already provided
tfidf = TfidfModel.load('../models/tfidf.model')
# Write the TF-IDF matrix to its own file so it does not overwrite the
# log-entropy matrix saved above.
corpora.mmcorpus.MmCorpus.serialize('../data/tfidf_matrix', tfidf[BOW_corpus])
print('Saved TF-IDF matrix')

print('Creating Similarity Index')
logent_corpus = MmCorpus('../data/log_entropy_matrix')
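# A plausible next step after the print above, not the original code: build and
# persist a similarity index over the log-entropy matrix. The shard prefix and
# output path are assumptions.
from gensim import similarities

index = similarities.Similarity('../data/wiki_index_shards', logent_corpus,
                                num_features=logent_corpus.num_terms)
index.save('../models/logent_index.index')
print('Saved similarity index')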