def build_tfidf(self, corpuskb, dictionary):
    # tfidfkb = models.TfidfModel(corpuskb)  # plain TF-IDF alternative
    tfidfkb = LogEntropyModel(corpuskb, id2word=dictionary)  # build the log-entropy weighting model

    corpus_tfidf = tfidfkb[corpuskb]  # apply the weighting to every document in the corpus

    tfidfkb.save(self.model_folder('tfidf_model'))
    return tfidfkb, corpus_tfidf
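# A minimal usage sketch (an addition, not from the original project): the same
# weighting steps outside the class, on a toy corpus. Current gensim's
# LogEntropyModel takes only the corpus, so id2word is omitted here.
from gensim import corpora
from gensim.models import LogEntropyModel

texts = [['human', 'interface'], ['survey', 'computer', 'interface']]
dictionary = corpora.Dictionary(texts)
corpuskb = [dictionary.doc2bow(t) for t in texts]
model = LogEntropyModel(corpuskb)  # log-entropy weighting over the toy corpus
weighted_corpus = model[corpuskb]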
def logentropybuildspace(searchobject, morphdict, sentences):
	"""

	currently unused

	:param searchobject:
	:param morphdict:
	:param sentences:
	:return:
	"""

	sentences = [[w for w in words.lower().split() if w] for words in sentences if words]
	sentences = [s for s in sentences if s]

	bagsofwords = buildwordbags(searchobject, morphdict, sentences)

	logentropydictionary = corpora.Dictionary(bagsofwords)
	logentropycorpus = [logentropydictionary.doc2bow(bag) for bag in bagsofwords]
	logentropyxform = LogEntropyModel(logentropycorpus)
	lsixform = LsiModel(corpus=logentropycorpus,
						id2word=logentropydictionary,
						onepass=False,
						num_topics=400)

	corpus = LogEntropyVectorCorpus(lsixform, logentropyxform, logentropydictionary, logentropycorpus, bagsofwords, sentences)

	return corpus
from gensim.models import LogEntropyModel, RpModel, TfidfModel


def create_doc_term_matrix(docs,
                           id2word,
                           tfidf=False,
                           logentropy=False,
                           random_projections=False):
    # Plain bag-of-words counts; each optional flag below re-weights the matrix.
    doc_term_matrix = [id2word.doc2bow(doc) for doc in docs]
    _save_model2(doc_term_matrix, 'doc_term_matrix')

    if random_projections:
        rp_model = RpModel(corpus=doc_term_matrix,
                           id2word=id2word,
                           num_topics=params['num_topics'])
        doc_term_matrix = rp_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_random_projections')

    if tfidf:
        tfidf_model = TfidfModel(id2word=id2word,
                                 corpus=doc_term_matrix,
                                 normalize=True)
        doc_term_matrix = tfidf_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_tfidf')

    if logentropy:
        log_model = LogEntropyModel(corpus=doc_term_matrix, normalize=True)
        doc_term_matrix = log_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_logentropy')

    return doc_term_matrix
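# Usage sketch (an addition): `params` and `_save_model2` are module-level
# helpers in the original project, so illustrative stand-ins are defined here.
from gensim.corpora import Dictionary

params = {'num_topics': 2}  # stand-in for the project's config dict

def _save_model2(obj, name):
    pass  # stand-in for the project's persistence helper

docs = [['human', 'interface'], ['survey', 'computer', 'interface']]
id2word = Dictionary(docs)
matrix = create_doc_term_matrix(docs, id2word, logentropy=True)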
import slate  # PDF text extraction (the 'slate' / 'slate3k' package)
from gensim.corpora import Dictionary, wikicorpus
from gensim.models import LogEntropyModel


def Pdf2Vec(titles):
    '''
    Vectorizes the given PDFs on your local filesystem into log-entropy
    weighted vectors to query against your similarity index.

    Returns:

    [document-logent-vec-1, document-logent-vec-2, ..., document-logent-vec-N]
    where N is the number of titles
    '''
    # TODO: Make it so you can give a model as an argument to vectorize a given
    # document into any trained gensim model

    ret_lst = []
    logent = LogEntropyModel.load('../models/logEntropy.model')
    diction = Dictionary.load('../models/wiki_dict.dict')
    for title in titles:
        curr_file = open('../data/articleData/pdfs/' + title + '.pdf', 'rb')  # slate needs binary mode
        doc = slate.PDF(curr_file)
        doc = ' '.join(doc)
        doc_tokens = wikicorpus.tokenize(doc)
        bow = diction.doc2bow(doc_tokens)
        bow_logent = logent[bow]
        ret_lst.append(bow_logent)
        curr_file.close()

    return ret_lst
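# Usage sketch (an addition; assumes the model and dictionary files exist at
# the relative paths hard-coded above, and that 'my_paper' names a PDF under
# ../data/articleData/pdfs/ -- both illustrative).
logent_vecs = Pdf2Vec(['my_paper'])
# To rank corpus documents against the PDF, query a similarity index built
# over the same log-entropy space, e.g.:
# sims = index[logent_vecs[0]]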
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from gensim import corpora, similarities
from gensim.models import LogEntropyModel


def calc_similarity(ids, docs, kRandom=3, nClusters=3, sortCluster=True):
    # ids: list of IDs identifying the texts
    # docs: list of tokenized docs

    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # # TF-IDF
    # tfidf = models.TfidfModel(corpus)
    # index = similarities.MatrixSimilarity(tfidf[corpus])
    # sims = index[tfidf[corpus]]
    # df_sims = pd.DataFrame(sims, index=ids,columns=ids)

    # Log entropy
    log_ent = LogEntropyModel(corpus)
    index = similarities.MatrixSimilarity(log_ent[corpus])
    sims = index[log_ent[corpus]]
    df_sims = pd.DataFrame(sims, index=ids, columns=ids)

    if sortCluster:
        # Ordered by clusters and distances
        X = df_sims.copy()
        model = KMeans(n_clusters=nClusters, random_state=kRandom)

        # pandas removed DataFrame.as_matrix(); use .values instead.

        clusassign = model.fit_predict(X.values)
        min_dist = np.min(cdist(X.values, model.cluster_centers_, 'euclidean'),
                          axis=1)

        Y = pd.DataFrame(min_dist,
                         index=X.index,
                         columns=['Center_euclidean_dist'])
        Z = pd.DataFrame(clusassign, index=X.index, columns=['cluster_ID'])
        A = pd.concat([Y, Z], axis=1)
        A = A.sort_values(['cluster_ID',
                           'Center_euclidean_dist']).reset_index()

        namelist = A['index'].tolist()
        df_sim_sorted = pd.DataFrame(namelist, columns=['NameSort'])
        df_sim_sorted = pd.merge(df_sim_sorted,
                                 df_sims,
                                 left_on='NameSort',
                                 right_index=True).set_index('NameSort')
        df_sim_sorted = df_sim_sorted[namelist]

        return df_sim_sorted
    else:
        return df_sims
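# Usage sketch (an addition) on a toy corpus: returns a doc-by-doc similarity
# matrix, with rows/columns ordered by cluster when sortCluster is True.
docs = [['apple', 'banana', 'fruit'],
        ['fruit', 'apple', 'pie'],
        ['car', 'engine', 'wheel']]
ids = ['doc_a', 'doc_b', 'doc_c']
df = calc_similarity(ids, docs, nClusters=2)
print(df)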
def log_entropy(doc):
    doc = pre_process_w2v(doc)
    doc_token = [i.split() for i in doc]
    dct = Dictionary(doc_token)
    corpus = [dct.doc2bow(row) for row in doc_token]  # bow per tokenized doc, not per raw string
    model = LogEntropyModel(corpus)
    vec = []
    for row in corpus:
        vector = model[row]  # transform the bow itself, not corpus[row]
        x = [t[1] for t in vector]  # keep only the weights
        vec.append(x)

    # Zero-pad every row to the length of the longest vector.
    length = len(sorted(vec, key=len, reverse=True)[0])
    vec = np.array([xi + [0] * (length - len(xi)) for xi in vec])
    return vec
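# Usage sketch (an addition): `pre_process_w2v` is project-specific, so a
# pass-through stand-in is defined here.
import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LogEntropyModel

def pre_process_w2v(doc):
    return [d.lower() for d in doc]  # stand-in for the real preprocessing

mat = log_entropy(['apple banana fruit', 'fruit apple pie', 'car engine wheel'])
print(mat.shape)  # (3, padded vector length)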
def create_corpus(dictionary,
                  documents,
                  is_tfidf=False,
                  smartirs=None,
                  is_log_entropy=False,
                  is_normalize=True):

    corpus = [dictionary.doc2bow(d, allow_update=False) for d in documents]
    if is_tfidf:
        tfidf = TfidfModel(corpus=corpus, smartirs=smartirs)
        corpus = tfidf[corpus]
    elif is_log_entropy:
        log_entropy_model = LogEntropyModel(corpus, normalize=is_normalize)
        corpus = log_entropy_model[corpus]

    # Saving here would overwrite the existing file; persist the dictionary
    # explicitly if it was allowed to update:
    # dictionary.save(args.dictionary)

    return corpus
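# Usage sketch (an addition) on a toy corpus; the 'ntc' smartirs string is one
# example of gensim's SMART notation for TF-IDF variants.
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LogEntropyModel

documents = [['human', 'interface'], ['survey', 'computer', 'interface']]
dictionary = Dictionary(documents)
corpus_tfidf = create_corpus(dictionary, documents, is_tfidf=True, smartirs='ntc')
corpus_logent = create_corpus(dictionary, documents, is_log_entropy=True)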
from gensim.corpora import WikiCorpus, wikicorpus, MmCorpus, Dictionary

articles = "enwiki-latest-pages-articles.xml.bz2"  # path to a Wikipedia dump

wiki_corpus = WikiCorpus(articles)  # This will take many hours! Output is Wikipedia in bag-of-words (BOW) sparse format.
wiki_corpus.dictionary.save("wiki_dict.dict")

MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)  #  File will be several GBs.


### Working with persisted corpus and dictionary
bow_corpus = MmCorpus("wiki_corpus.mm")  # Revive a corpus

dictionary = Dictionary.load("wiki_dict.dict")  # Load a dictionary


### Transformations among vector spaces
from gensim.models import LsiModel, LogEntropyModel

logent_transformation = LogEntropyModel(wiki_corpus, id2word=dictionary)  # Log Entropy weights frequencies of all document features in the corpus

tokenize_func = wikicorpus.tokenize  # The tokenizer used to create the Wikipedia corpus
document = "Some text to be transformed."
bow_document = dictionary.doc2bow(tokenize_func(document))  # First, tokenize document using the same tokenization as was used on the background corpus, and then convert it to BOW representation using the dictionary created when generating the background corpus.
logent_document = logent_transformation[[bow_document]]  # converts a single document to log entropy representation. document must be in the same vector space as corpus.

documents = ["Some iterable", "containing multiple", "documents", "..."]
bow_documents = (dictionary.doc2bow(tokenize_func(document)) for document in documents)  # use a generator expression because...
logent_documents = logent_transformation[bow_documents]  # ...transformation is done during iteration of documents using generators, so this uses constant memory

### Chained transformations
MmCorpus.serialize("logent_corpus.mm", logent_transformation[bow_corpus], id2word=dictionary)  # builds a corpus by iterating over the documents of bow_corpus as transformed to log entropy representation. Will also take many hours with the Wikipedia corpus.
logent_corpus = MmCorpus("logent_corpus.mm")  # MmCorpus is constructed from a file, so serialize first, then load.

lsi_transformation = LsiModel(corpus=logent_corpus, id2word=dictionary, num_topics=400)  # creates an LSI transformation model from the log entropy corpus representation. Takes several hours with the Wikipedia corpus.
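### Similarity queries (an illustrative continuation, not part of the original walkthrough)
from gensim.similarities import MatrixSimilarity

# Chain bow -> log entropy -> LSI for a query document, then rank the corpus against it.
index = MatrixSimilarity(lsi_transformation[logent_corpus], num_features=400)
query_vec = lsi_transformation[logent_transformation[bow_document]]
sims = index[query_vec]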
def _new_model(self, X=None, y=None):
    # scikit-learn-style factory: fit a LogEntropyModel on the bag-of-words corpus X.
    return LogEntropyModel(X, normalize=self.normalize)
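# Context sketch (an assumption: the method above plausibly belongs to a
# scikit-learn-style wrapper; the class below is illustrative, not the original).
from sklearn.base import BaseEstimator
from gensim.models import LogEntropyModel

class LogEntropyTransformer(BaseEstimator):  # hypothetical host class
    def __init__(self, normalize=True):
        self.normalize = normalize

    def _new_model(self, X=None, y=None):
        return LogEntropyModel(X, normalize=self.normalize)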
from gensim import corpora
from gensim.corpora import MmCorpus
from gensim.models import LogEntropyModel, TfidfModel

print('Finished making the wikicorpus, saving BOW corpus\n')
corpora.mmcorpus.MmCorpus.serialize('../data/wiki_en_vocab200k', wiki_corpus)
print('Done saving BOW Corpus\n')

# Save the dictionary; you will need it to convert future documents into
# BOW format

#wiki.dictionary.save("../data/wiki_dict.dict")
#print('Saved dictionary')

print('Creating LogEntropy TF-IDF and regular TF-IDF matrices and models')
BOW_corpus = MmCorpus('../data/wiki_en_vocab200k')  # Resurrect the BOW corpus

#log_entropy = LogEntropyModel(BOW_corpus)
#log_entropy.save('../models/logEntropy.model') #already provided
log_entropy = LogEntropyModel.load('../models/logEntropy.model')
corpora.mmcorpus.MmCorpus.serialize('../data/log_entropy_matrix',
                                    log_entropy[BOW_corpus])

print('Saved LogEntropy TF-IDF matrix')

#tfidf = TfidfModel(BOW_corpus)
#tfidf.save('../models/tfidf.model') #already provided
tfidf = TfidfModel.load('../models/tfidf.model')
corpora.mmcorpus.MmCorpus.serialize('../data/tfidf_matrix',
                                    tfidf[BOW_corpus])

print('Saved TF-IDF matrix')

print('Creating Similarity Index')
logent_corpus = MmCorpus('../data/log_entropy_matrix')
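# The script is truncated here; a plausible completion (the index path and
# save location below are assumptions, not from the original).
from gensim.similarities import Similarity

index = Similarity('../data/logent_index', logent_corpus,
                   num_features=logent_corpus.num_terms)
index.save('../models/logEntropy.index')
print('Saved Similarity Index')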