def doHDP(parent, directory):  # requires that the dictionary and corpus files already exist
    files = os.listdir(parent + directory)
    dictionary = corpora.Dictionary.load("../processed/" + parent[9:-1] + "_lda/" + directory + ".dict")
    review_corpus = corpora.MmCorpus("../processed/" + parent[9:-1] + "_lda/" + directory + ".mm")
    numTopics = [3, 5, 10]
    for numTopic in numTopics:
        print("Running HDP for", directory, "for", numTopic, "topics\n")
        hdp = HdpModel(corpus=review_corpus, id2word=dictionary, T=numTopic, K=10, gamma=0.8, alpha=1)
        hdp.save("../processed/" + parent[9:-1] + "_hdp/" + directory + "_" + str(numTopic) + "_topicModel.hdp")
        showHDPTopics(hdp)
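# A hypothetical call to doHDP above, shown only to illustrate the expected
# arguments: the parent/directory values are placeholders, and the function
# assumes the ../processed/<name>_lda/ dictionary and corpus files plus a
# showHDPTopics helper exist in the surrounding project (not shown here).
if __name__ == "__main__":
    doHDP("../input/reviews/", "hotels")  # reads ../processed/reviews_lda/hotels.dict and .mm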
def hldaperplexity(train, test):
    corpus = gensim.matutils.Dense2Corpus(train.astype(int), documents_columns=False)
    corpusTest = gensim.matutils.Dense2Corpus(test.astype(int), documents_columns=False)
    dictionary = Dictionary.from_corpus(corpus)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        c = Chrono().start()
        hlda = HdpModel(corpus, dictionary)
        c.end()
    corpus_words = sum(cnt for document in corpusTest for _, cnt in document)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        ll = hlda.evaluate_test_corpus(corpusTest)
    perwordbound = ll / corpus_words
    print("LDA %.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words"
          % (perwordbound, numpy.exp2(-perwordbound), len(corpusTest), corpus_words))
    return numpy.exp2(-perwordbound), c.elapsed()
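# A hedged usage sketch for hldaperplexity above: the random matrices are
# stand-ins for real document-term counts, and Chrono is assumed to be a small
# timing helper from the original project (not shown here).
import numpy

train = numpy.random.randint(0, 5, size=(80, 500))   # 80 training docs, 500-term vocabulary
test = numpy.random.randint(0, 5, size=(20, 500))    # 20 held-out docs
perplexity, seconds = hldaperplexity(train, test)
print(perplexity, seconds)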
class HDP(object):
    def __init__(self, corpus, dct, df):
        self.dct = dct
        self.corpus = corpus
        self.model = HdpModel(corpus, dct)
        self.df = df
        self.lda = None
        self.topic_dist = None

    def build_lda(self):
        # Convert the fitted HDP model into an equivalent LDA model.
        self.lda = self.model.suggested_lda_model()

    def build_topic_dist(self):
        # Dense topic distribution per document (HDP's default truncation is 150 topics).
        self.topic_dist = []
        for lst in self.lda[self.corpus]:
            distr = np.array([0.0] * 150)
            for tup in lst:
                distr[tup[0]] = tup[1]
            self.topic_dist.append(distr)

    def jensen_shannon(self, query, matrix):
        # Jensen-Shannon distance between two topic distributions.
        p = query
        q = matrix
        m = 0.5 * (p + q)
        E1 = entropy(p, m)
        E2 = entropy(q, m)
        E = E1 + E2
        return np.sqrt(0.5 * E)

    def similarity(self, query, matrix, k=10):
        # Indices of the k documents closest to the query distribution.
        sims = []
        for index, item in enumerate(matrix):
            sims.append(self.jensen_shannon(query, matrix[index]))
        sims = np.array(sims)
        return sims.argsort()[:k]

    def similarity_query(self, index, k=10, n=2):
        # Find the k documents most similar to the document stored in row `index`,
        # column `n` of the dataframe (expected to hold a tokenized text).
        bow = self.dct.doc2bow(self.df.iloc[index, n])
        doc_distribution = np.array([0.0] * 150)
        for tup in self.lda.get_document_topics(bow=bow):
            doc_distribution[tup[0]] = tup[1]
        return self.similarity(doc_distribution, self.topic_dist, k)
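# A hypothetical usage sketch for the HDP wrapper above; the toy dataframe, its
# column layout (tokenized documents in column index 2), and all variable names
# are assumptions, not taken from the original snippet.
import pandas as pd
from gensim.corpora import Dictionary

docs = [["topic", "model", "inference"], ["bayesian", "nonparametric", "model"]]
df = pd.DataFrame({"id": [0, 1], "title": ["a", "b"], "tokens": docs})
dct = Dictionary(docs)
corpus = [dct.doc2bow(doc) for doc in docs]

wrapper = HDP(corpus, dct, df)
wrapper.build_lda()          # derive the equivalent LDA model from the HDP fit
wrapper.build_topic_dist()   # dense per-document topic vectors
print(wrapper.similarity_query(0, k=1))  # nearest document to document 0 by Jensen-Shannon distance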
#
# Induce a hierarchy from the probabilistic generative model fit to the corpus
#
import numpy as np
from gensim.corpora import Dictionary
from gensim.models.hdpmodel import HdpModel
from collections import OrderedDict
from scipy.stats import dirichlet
import edmonds
import pickle

out = '/home/mjg/data/descriptions'

# Load model
dictionary = Dictionary.load_from_text(out + '_wordids.txt.bz2')
hda = HdpModel.load(out + ".hdm")
topics = hda.show_topics(-1, -1, formatted=False)


def mkarray(l):
    a = np.array([x for x in l])
    return a


# Calculate corpus probabilities
alpha = 0.1 * len(dictionary)
phi = []
for i in range(len(topics)):
    phic = {}
    for j in range(len(topics[i][1])):
        phic[topics[i][1][j][0]] = topics[i][1][j][1]
    phi.append(OrderedDict(sorted(phic.items())))
kfolds = 10
kf = cross_validation.KFold(count1, n_folds=kfolds)
for li in f:
    li = li.split()
    corpora_documents.append(li)
for la in f2:
    la = la.split()
    label_level.append(la)
corpora_documents = array(corpora_documents)
label_level = array(label_level)
# Build the dictionary and bag-of-words corpus
dictionary = corpora.Dictionary(corpora_documents)
# dictionary.save('dictionary.dict')
corpus = [dictionary.doc2bow(text) for text in corpora_documents]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
hdp = HdpModel(corpus_tfidf, id2word=dictionary)
corpus_hdp = hdp[corpus_tfidf]
index = similarities.MatrixSimilarity(corpus_hdp)
print(hdp.print_topics(num_topics=20, num_words=10))
print(lsi[test_doc_bow])

test_doc2 = 'Material and physics'
test_doc2 = custom_preprocess(test_doc2)
test_doc_bow2 = journals_dictionary.doc2bow(test_doc2)
print(test_doc_bow2)
print(lsi[test_doc_bow2])

lsi_cm = CoherenceModel(model=lsi, corpus=journals_corpus, dictionary=journals_dictionary,
                        texts=journals['Full title'], coherence='c_v')
LSI_cm = lsi_cm.get_coherence()
LSI_cm

from gensim.models.hdpmodel import HdpModel

hdp = HdpModel(corpus=journals_corpus, id2word=journals_dictionary)
hdp_topics = hdp.print_topics()
for topic in hdp_topics:
    print(topic)

test_doc = 'Journal of medicines and herbs'
test_doc = custom_preprocess(test_doc)
test_doc_bow = journals_dictionary.doc2bow(test_doc)
print(test_doc_bow)
print(hdp[test_doc_bow])

test_doc2 = 'Material and physics'
test_doc2 = custom_preprocess(test_doc2)
test_doc_bow2 = journals_dictionary.doc2bow(test_doc2)
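# A hedged sketch mirroring the c_v coherence computed for the LSI model above:
# convert the HDP fit to its suggested LDA model and score it with CoherenceModel.
# Assumes journals_corpus, journals_dictionary, and the texts column are the same
# objects used earlier in this notebook.
hdp_lda = hdp.suggested_lda_model()
hdp_cm = CoherenceModel(model=hdp_lda, corpus=journals_corpus, dictionary=journals_dictionary,
                        texts=journals['Full title'], coherence='c_v')
HDP_cm = hdp_cm.get_coherence()
HDP_cm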
def main():
    parser = ArgumentParser(
        description='Wrapper script for churning wiki or Elasticsearch datasets through gensim '
                    'to produce topic models. Please see the gensim documentation for more information.')
    parser.add_argument('-ds', '--dataset', default='wiki',
                        help='What kind of dataset to use. (wiki, es, file)')
    parser.add_argument('-d', '--dump-file',
                        help='Wiki: bz2 dump file with wiki in it')
    parser.add_argument('-l', '--limit',
                        help='Wiki: how many documents to extract from wiki')
    parser.add_argument('--model-id', default='model',
                        help='Filename for created model.')
    parser.add_argument('--model-type', default='lsi',
                        help='Model type (lsi, lda, word2vec, hdp, vocabulary).')
    parser.add_argument('--n-topics', default=10,
                        help='Number of topics to model.')
    parser.add_argument('--n-passes', default=1,
                        help='Number of passes for LDA model.')
    parser.add_argument('--w2v-size', default=100,
                        help='Size of Word2Vec context.')
    parser.add_argument('--w2v-window', default=5,
                        help='Window for Word2Vec.')
    parser.add_argument('-q', '--query', default=None,
                        help='Elasticsearch: query to use to fetch documents')
    parser.add_argument('--index',
                        help='Elasticsearch: index to read from.')
    parser.add_argument('--doc_type', default='doc',
                        help='Elasticsearch: data type in index.')
    parser.add_argument('--data-dir',
                        help='Directory to save the generated models and vocabularies into.')
    parser.add_argument('--vocab',
                        help='Prebuilt vocabulary file. Use this to avoid having to generate one.')
    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ['lsi', 'lda', 'word2vec', 'hdp', 'vocabulary']:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)
    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    data_type = opts.dataset.lower()
    if data_type not in ['es', 'wiki', 'file']:
        logging.error("Invalid dataset type %s" % data_type)
        parser.print_usage()
        exit(-1)

    limit = int(opts.limit) if opts.limit else None
    if not dump_fn and data_type in ['wiki']:
        logging.error('--dump-file required for wiki dataset')
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == 'es' and index is None:
        logging.error("Please specify the Elasticsearch index to fetch from using the --index parameter")
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)

    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = '%s_%s_%d' % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = '%s/%s' % (data_dir, model_fn)
    if model_type == 'word2vec':
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = '%s_w_%s_s_%s' % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s." % model_fn)

    if data_type == 'es':
        logging.info("Using data type %s with index %s, doc_type %s query %s"
                     % (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(read_index=index, read_doc_type=doc_type,
                                       query=query, normalize_func=normalize_es)
    elif data_type == 'wiki':
        logging.info("Using data type %s with dump_file %s and limit %s"
                     % (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit,
                                   normalize_func=normalize_wiki)
    elif data_type == 'file':
        logging.info("Using data type %s with dump_file %s and limit %s"
                     % (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn, num_articles=limit,
                              normalize_func=normalize_file)

    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words('norwegian'))
    if not vocab_file or model_type == 'vocabulary':
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + '.vocab')
    else:
        vocab = Dictionary.load(vocab_file)
    if model_type == 'vocabulary':
        return

    tfidf = TfidfModel(dictionary=vocab)
    if model_type == 'lsi':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, id2word=vocab)
    elif model_type == 'lda':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics,
                         passes=n_passes, id2word=vocab)
    elif model_type == 'word2vec':
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == 'hdp':
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)
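# Hypothetical entry point for the wrapper above; the script name in the example
# command is a placeholder, only the flags come from the argument parser:
#   python topic_models.py --dataset wiki --dump-file nowiki-latest-pages-articles.xml.bz2 \
#       --model-type hdp --n-topics 50 --model-id nowiki --data-dir ./models
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()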
def classify(k4=None):
    corpora_documents = []
    label_level = []
    filename = "label_level01.txt"
    f = codecs.open("finalresult1.txt", 'r', encoding="utf-8").readlines()
    f2 = codecs.open(filename, 'r', encoding="utf-8").readlines()
    y2, label_dict, n_x = y_label(filename)
    count1 = 12901
    kfolds = 10
    kf = cross_validation.KFold(count1, n_folds=kfolds)
    for li in f:
        li = li.split()
        corpora_documents.append(li)
    for la in f2:
        la = la.split()
        label_level.append(la)
    corpora_documents = array(corpora_documents)
    label_level = array(label_level)
    sum_count = []
    k = []; k1 = []
    k2 = []; k3 = []
    for train_index, test_index in kf:
        # print(train_index, test_index)
        X_train, X_test = corpora_documents[train_index], corpora_documents[test_index]
        y_train, y_test = label_level[train_index], label_level[test_index]
        y2_train, y2_test = y2[train_index], y2[test_index]
        # Build the dictionary and bag-of-words corpus from the training fold
        dictionary = corpora.Dictionary(X_train)
        # dictionary.save('dictionary.dict')
        corpus = [dictionary.doc2bow(text) for text in X_train]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        # Lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=500)
        # corpus_lsi = Lsi[corpus_tfidf]
        # index = similarities.MatrixSimilarity(corpus_lsi)
        Lsi = HdpModel(corpus_tfidf, id2word=dictionary)
        corpus_lsi = Lsi[corpus_tfidf]
        index = similarities.MatrixSimilarity(corpus_lsi)
        count = 0
        y_prediction = zeros((len(test_index), n_x))
        for i, li in enumerate(X_test):
            # li = li.split()
            test_corpus_1 = dictionary.doc2bow(li)
            test_corpus_tfidf = tfidf[test_corpus_1]
            test_corpus_Lsi = Lsi[test_corpus_tfidf]
            sims = index[test_corpus_Lsi]
            sort_sims = sorted(enumerate(sims), key=lambda x: -x[1])
            print(sort_sims[:k4])
            # Vote over the labels of the k4 most similar training documents
            predictions = {}
            for m, n in sort_sims[:k4]:
                for key in y_train[m]:
                    if key not in predictions.keys():
                        predictions[key] = 1
                    else:
                        predictions[key] += 1
            true_label = y_test[i]
            prediction = []
            for key in predictions.keys():
                if predictions[key] >= k4 / 2:
                    prediction.append(key)
            true_label.sort()
            prediction.sort()
            # print("true label:", true_label)
            if len(prediction) == 0:
                # Fall back to the single most frequent label among the neighbours
                dict_sorted = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
                prediction.append(dict_sorted[0][0])
            if true_label == prediction:
                count += 1
            # print("predict", prediction)
            # print(predictions)
            for label in prediction:
                # print('label_dict[label]:', label_dict[label])
                # print(i, len(y_prediction[i]))
                y_prediction[i, label_dict[label]] = 1
        # print("count:", count)
        sum_count.append(count)
        hammingloss = sklearn.metrics.hamming_loss(y2_test, y_prediction)
        jaccard = sklearn.metrics.jaccard_similarity_score(y2_test, y_prediction)
        f1score = sklearn.metrics.f1_score(y2_test, y_prediction, average='micro')
        zerooneloss = sklearn.metrics.zero_one_loss(y2_test, y_prediction)
        print("hammingloss,jaccard,f1score,zerooneloss:", hammingloss, jaccard, f1score, zerooneloss)
        k.append(hammingloss)
        k1.append(jaccard)
        k2.append(zerooneloss)
        k3.append(f1score)
    print("hamming_loss mean:", array(k).mean())
    print("hamming_loss var:", array(k).var())
    print("jaccard mean:", array(k1).mean())
    print("jaccard var:", array(k1).var())
    print("f1_score mean:", array(k3).mean())
    print("f1_score var:", array(k3).var())
    print("zero_one_loss mean:", array(k2).mean())
    print("zero_one_loss var:", array(k2).var())
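# Hypothetical call to classify above: k4 is the number of nearest neighbours used
# in the label vote; the value 5 is a placeholder, and finalresult1.txt /
# label_level01.txt must already exist alongside the script.
classify(k4=5)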
# remove common words
stoplist = set(
    'a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your'
    .split(','))
stop_ids = [
    corpus.dictionary.token2id[stopword] for stopword in stoplist
    if stopword in corpus.dictionary.token2id
]
corpus.dictionary.filter_tokens(stop_ids)

# only keep the most frequent words
corpus.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=100000)

# save the bag-of-words corpus and the dictionary
MmCorpus.serialize(out + '_bow.mm', corpus, progress_cnt=10000)
corpus.dictionary.save_as_text(out + '_wordids.txt.bz2')

# save memory
dictionary = Dictionary.load_from_text(out + '_wordids.txt.bz2')
del corpus

# initialize corpus reader and word->id mapping
mm = MmCorpus(out + '_bow.mm')

# build tfidf
tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
tfidf.save(out + '.tfidf.model')
MmCorpus.serialize(out + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

# Run hierarchical Dirichlet process over corpus
hda = HdpModel(corpus=mm, id2word=dictionary)
hda.save(out + ".hdm")
# Load the data
print("Loading data...")
from loadjson import *
from nltk.corpus import stopwords

# Prepare the data
stoplist = stopwords.words('french') + stopwords.words('english') + list('\'"`():,;.!?')
docs = [[w for s in m['sents'] for w in s
         if w not in stoplist and w.isalnum() and len(w) >= 2]
        for m in mails.values()]
id2word = Dictionary(docs)
docs = [id2word.doc2bow(doc) for doc in docs]  # index the words

print("Training HDP model...")
hdp = HdpModel(docs, id2word=id2word, max_time=10*60)
hdp.save('hdp.gensim')
id2word = hdp.id2word

print("Exporting HDP model...")
# Build the word-topic association table
out_topicid = []
out_word = []
out_p = []
for topicid, words in hdp.show_topics(topics=-1, topn=500, formatted=False):
    for word, p in words:
        if p < 5e-3:
            break
        out_topicid.append(int(topicid))
        out_word.append(word)
def fit_model(corpus, id2word, num_topics=20):
    # Train the HDP model and show the first num_topics topics
    hdp = HdpModel(corpus=corpus, id2word=id2word)
    hdp.print_topics(num_topics)
    return hdp
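# A minimal, hypothetical call to fit_model above; the toy documents and variable
# names are placeholders, not from the original snippet.
from gensim.corpora import Dictionary

texts = [["machine", "learning", "model"], ["topic", "model", "inference"]]
id2word = Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]
hdp = fit_model(corpus, id2word, num_topics=10)
print(hdp.print_topics(num_topics=10))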