def write_topics(model_path, csv_name, k):
    """Write the top-k words of each topic in a saved LDA model to a CSV file."""
    model = LdaModel.load(model_path)
    # topic-term probability matrix, shape (num_topics, num_terms);
    # replaces the nonexistent model.return_topic(topicid=...) call
    topics = model.get_topics()

    dictionary = Dictionary.load('data/dictionary/tweets.dict')
    dictionary[0]  # touch the dictionary so id2token gets populated
    word_indices = dictionary.id2token

    with open(csv_name, 'w') as f:  # file() is Python 2 only
        writer = csv.writer(f)
        output = [[None for _ in range(model.num_topics)] for _ in range(k)]

        for topic_id, topic in enumerate(topics):
            # take the k most probable words for this topic, highest first
            for rank, index in enumerate(topic.argsort()[::-1][:k]):
                output[rank][topic_id] = {'word': word_indices[index], 'p': topic[index]}

        for topic_id in range(model.num_topics):
            row = ['z = ' + str(topic_id)]
            for rank in range(k):
                row.append(output[rank][topic_id]['word'] + ':' + str(output[rank][topic_id]['p']))
            writer.writerow(row)
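A minimal usage sketch of write_topics; the model path and output file name below are hypothetical, and the dictionary path hard-coded in the function is assumed to exist:

# hypothetical paths; dumps the top 10 words per topic
write_topics('models/tweets.lda', 'topics_top10.csv', k=10)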
def __init__(self, destination, fileName, modelName='', ldaPasses='', topicNum=''):
    '''
    Constructor
    '''
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    self.__destination = destination
    self.__fileName = fileName
    self.__modelName = modelName
    self.__ldaPasses = ldaPasses
    self.__topicNum = topicNum

    #=======================================================================
    # STOP WORDS AND CHARACTERS
    #=======================================================================
    self.__stopwords = stopwords.words('english')  # + string.punctuation
    self.__chars_to_remove = [u'[', u']', u'(', u')', u'*', u'%', u'{', u'}', u'\n', u'\n\n', u'\t',
                              u';', u'/', u'^', u'--', u'\\', u'+', u'-', u'.', u'?', u'&', u'#']
    self.__stopwords.extend(self.__chars_to_remove)
    self.__stopwords.extend([item for item in string.punctuation])

    #=======================================================================
    # DATABASE
    #=======================================================================
    self.__db = connectMySQL(db='xpath', port=3366)
    self.__queryResults = None
    self.__cleanedCorpus = []

    if modelName != '' and os.path.exists(self.__destination + modelName + '.lda'):
        self.__ldaModel = LdaModel.load(self.__destination + modelName + '.lda', mmap='r')

    if fileName != '' and os.path.exists(self.__destination + fileName + '.dict'):
        self.__modelDict = corpora.Dictionary.load(self.__destination + fileName + '.dict')
def create_evaluation_distinctiveness(config, Kind):
    model_fname = config.model_fname % Kind.__name__

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except:
        error('Cannot evaluate LDA models that are not built yet!')

    scores = utils.score(model, utils.kullback_leibler_divergence)
    total = sum([x[1] for x in scores])

    logger.info("%s model KL: %f" % (model_fname, total))
    with open(config.path + 'evaluate-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, total])

    # per-topic entropy: -sum_w p(w) * log2 p(w)
    etas = list()
    for topic in model.state.get_lambda():
        topic_eta = list()
        for p_w in topic:
            topic_eta.append(p_w * numpy.log2(p_w))
        etas.append(-sum(topic_eta))

    entropy = sum(etas) / len(etas)

    logger.info("%s model entropy mean: %f" % (model_fname, entropy))
    with open(config.path + 'evaluate-entropy-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, entropy])
def load(self):
    '''Load the LDA model and the dictionary.'''
    lda_file = config.get('dmp', 'lda_file')
    dic_file = config.get('dmp', 'dic_file')
    self.lda = LdaModel.load(lda_file)
    self.dic = Dictionary.load(dic_file)
def create_lda_model(project, corpus, id2word, name, use_level=True, force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level

    model_fname += '.lda.gz'

    if not os.path.exists(model_fname) or force:
        if corpus:
            update_every = None  # run in batch if we have a pre-supplied corpus
        else:
            update_every = 1

        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         alpha=project.alpha,
                         eta=project.eta,
                         passes=project.passes,
                         num_topics=project.num_topics,
                         iterations=project.iterations,
                         eval_every=None,  # disable perplexity tests for speed
                         update_every=update_every,
                         )

        if corpus:
            model.save(model_fname)
    else:
        model = LdaModel.load(model_fname)

    return model, model_fname
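A hedged sketch of calling create_lda_model, assuming a project object that carries the attributes the function reads (full_path, num_topics, alpha, eta, passes, iterations, level); every name below is illustrative:

# illustrative call; bow_corpus and dictionary come from earlier preprocessing
model, fname = create_lda_model(project, bow_corpus, dictionary,
                                name='Changeset', use_level=False)
print('LDA model stored at', fname)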
def calculateLDADistance(self, modelName='', topNSimilar='', topicList=''):
    if modelName == '':
        modelName = self.__fileName

    if topNSimilar == '':
        topNSimilar = 5

    write2file = self.__destination + modelName + "_results_LDA_similarTopics.csv"
    resultsCSV = open(write2file, "w")

    print('Reading model data')
    gensimDict = corpora.Dictionary.load(self.__destination + self.__fileName + '.dict')
    ldaModel = LdaModel.load(self.__destination + modelName + '.lda', mmap=None)
    topics = ldaModel.show_topics(num_topics=ldaModel.num_topics, num_words=len(gensimDict), formatted=False)

    #=======================================================================
    # GET SIMILARITY VECTORS
    #=======================================================================
    print('Extracting vectors')
    topicsSorted = [sorted(topic, key=lambda item: item[1]) for topic in topics]
    vectors = []

    for topic in topicsSorted:
        vector = [item[0] for item in topic]
        vectors.append(vector)

    #=======================================================================
    # CALCULATE SIMILARITIES BETWEEN TOPICS
    #=======================================================================
    print('Calculating distances between LDA topics\n')
    results = []

    for topicListItem in topicList:
        distances = []
        for j in range(0, len(vectors)):
            dist = euclidean(vectors[topicListItem], vectors[j])
            distances.append(dist)
        results.append(distances)

    #=======================================================================
    # EXPORT TOP N SIMILAR TOPICS AND PRINT OUT QUERY TERMS
    #=======================================================================
    print('Writing found similar topics to file\n')
    for resultItem in range(0, len(results)):
        # ascending order: the smallest euclidean distance is the most similar
        # topic (index 0 is the queried topic itself, at distance 0)
        similarLDATopics = np.argsort(results[resultItem])

        for similarItem in similarLDATopics[:topNSimilar]:
            resultsCSV.write(str(topicList[resultItem]) + '; ' + str(similarItem) + '; ' +
                             ', '.join(x[1].strip() for x in topics[similarItem][:100]) + '\n\n')

        resultsCSV.write('*******************************************\n\n')

    resultsCSV.close()
def evaluate_log(context, config):
    logger.info('Evaluating models for: %s' % config.project.name)

    model_fname = config.model_fname % ChangesetCorpus.__name__
    changeset_fname = config.corpus_fname % ChangesetCorpus.__name__
    commit_fname = config.corpus_fname % CommitLogCorpus.__name__

    try:
        commit_id2word = Dictionary.load(commit_fname + '.dict')
        commit_corpus = MalletCorpus(commit_fname, id2word=commit_id2word)
        changeset_id2word = Dictionary.load(changeset_fname + '.dict')
        changeset_corpus = MalletCorpus(changeset_fname, id2word=changeset_id2word)
    except:
        error('Corpora not built yet -- cannot evaluate')

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except:
        error('Cannot evaluate LDA models that are not built yet!')

    changeset_doc_topic = get_doc_topic(changeset_corpus, model)
    commit_doc_topic = get_doc_topic(commit_corpus, model)

    # for each document, find the highest rank at which commit and changeset
    # share a topic
    first_shared = dict()
    for id_ in commit_doc_topic:
        commit_topics = [topic[0] for topic in commit_doc_topic[id_]]
        try:
            changeset_topics = [topic[0] for topic in changeset_doc_topic[id_]]
        except:
            continue

        maximum = 101
        minimum = maximum

        for i, topic in enumerate(commit_topics):
            if topic in changeset_topics:
                j = changeset_topics.index(topic)
                minimum = min(minimum, max(i, j))

        for i, topic in enumerate(changeset_topics):
            if topic in commit_topics:
                j = commit_topics.index(topic)
                minimum = min(minimum, max(i, j))

        first_shared[id_] = minimum

        if minimum == maximum:
            logger.info('No common topics found for %s' % str(id_))
            del first_shared[id_]

    mean = sum(first_shared.values()) / len(first_shared)

    with open('data/evaluate-log-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, mean] + list(first_shared.values()))
def __init__(self):
    cwd = os.path.dirname(__file__)
    dictionary_path = os.path.abspath(os.path.join(cwd, 'models/dictionary.dict'))
    lda_model_path = os.path.abspath(os.path.join(cwd, 'models/lda_model_10_topics.lda'))

    self.dictionary = corpora.Dictionary.load(dictionary_path)
    self.lda = LdaModel.load(lda_model_path)
def load_lda_model(lda_model_name=None, mallet=False):
    # guard against a missing name before touching the file system
    if lda_model_name and os.path.isfile(lda_model_name):
        if mallet:
            lda_model = LdaMallet.load(lda_model_name)
        else:
            lda_model = LdaModel.load(lda_model_name)
        return lda_model
    return None
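A hedged usage sketch for load_lda_model; the file name is hypothetical and assumes a model saved earlier with save():

model = load_lda_model('models/lda_model_50_topics.lda', mallet=False)
if model is None:
    raise FileNotFoundError('train and save the LDA model before loading it')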
def analyzeUniqueLDA(self, modelName='', numberOfTerms=''):
    '''
    modelName -> name of model to read in to memory without the extension
    '''

    if modelName == '':
        modelName = self.__fileName

    if numberOfTerms == '':
        numberOfTerms = 100

    write2File = self.__destination + modelName + "_results_unique_%sTerms.csv" % (numberOfTerms)
    resultsCSV = open(write2File, "w")

    model = LdaModel.load(self.__destination + modelName + '.lda', mmap=None)

    # only prints top words; range over all topics (num_topics - 1 would skip the last one)
    for t in range(0, model.num_topics):
        topicSet = [v[1].strip() for v in model.show_topic(t, numberOfTerms) if v[1] not in self.__stopwords]

        for key in self.__queryWords:
            difference = set(topicSet).intersection(self.__queryWords[key])
            if len(difference) > 0:
                self.__overlapingTopics[key][t] = topicSet

    try:
        for key in self.__overlapingTopics:
            uniqueQueryTerms = []
            if self.__overlapingTopics[key]:
                for topicKey in self.__overlapingTopics[key]:
                    topicTerms = [w for w in self.__overlapingTopics[key][topicKey] if w not in self.__stopwords]
                    uniqueQueryTerms.extend(topicTerms)

                uniqueQueryTerms = [x for x in set(uniqueQueryTerms)]
                resultsCSV.write(key + ';' + str(topicKey) + ';' + ', '.join(uniqueQueryTerms) + '\n\n')
                resultsCSV.write('***************************************\n')

                print(key, uniqueQueryTerms)
                print('*************************\n')
    except KeyError as e:
        print(e)

    resultsCSV.close()
def analyzeLDA(self, modelName='', numberOfTerms=''):
    '''
    modelName -> name of model to read in to memory without the extension
    '''

    if modelName == '':
        modelName = self.__fileName

    if numberOfTerms == '':
        numberOfTerms = 100

    write2file = self.__destination + modelName + "_results_%s_SW.csv" % (numberOfTerms)
    resultsCSV = open(write2file, "w")

    model = LdaModel.load(self.__destination + modelName + '.lda', mmap=None)

    # only prints top words; range over all topics (num_topics - 1 would skip the last one)
    for t in range(0, model.num_topics):
        topicSet = [v[1].strip() for v in model.show_topic(t, numberOfTerms) if v[1] not in self.__stopwords]

        for key in self.__queryWords:
            difference = set(topicSet).intersection(self.__queryWords[key])
            if len(difference) > 0:
                self.__overlapingTopics[key][t] = topicSet

    try:
        for key in self.__overlapingTopics:
            if self.__overlapingTopics[key]:
                for topicKey in self.__overlapingTopics[key]:
                    topicTerms = [w.strip() for w in self.__overlapingTopics[key][topicKey] if w not in self.__stopwords][:100]
                    resultsCSV.write(key + ';' + str(topicKey) + ';' + ', '.join(topicTerms) + '\n\n')
                    print(key, '\t', topicKey, '\t', topicTerms)

                resultsCSV.write('***************************************\n')
                print('*************************\n')

        write2fileJSON = self.__destination + modelName + "_results_%s_SW.json" % (numberOfTerms)
        with open(write2fileJSON, 'w') as fp:
            json.dump(self.__overlapingTopics, fp)
    except KeyError as e:
        print(e)

    resultsCSV.close()
def get_keywords(threshold=0.01, model_path='result/model.lda'):
    lda_model = LdaModel.load(model_path)
    topic_num = lda_model.num_topics

    keywords = set()
    for topic_id in range(topic_num):
        topic = lda_model.state.get_lambda()[topic_id]
        topic = topic / topic.sum()  # normalize to probability distribution
        signif_word_ids = np.where(topic > threshold)[0]
        keywords = keywords.union([lda_model.id2word[word_id] for word_id in signif_word_ids])

    return keywords
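A short, hedged usage sketch of get_keywords; the path is the function's own default, and the threshold keeps words above 1% topic probability:

# collect every word that exceeds 1% probability in at least one topic
keywords = get_keywords(threshold=0.01, model_path='result/model.lda')
print(len(keywords), 'keywords, e.g.:', sorted(keywords)[:10])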
def __init__(self):
    # current_working_dir = '/home/etu/eason/nodejs/Semantic_Aware_RecSys'
    current_working_dir = '.'
    os.chdir(current_working_dir)
    lda_model_path = "./LDAmodel/final_ldamodel"

    self.lda = LdaModel.load(lda_model_path)
    self.no_of_recommendation = 10
    self.omit_topic_below_this_fraction = 0.1
    self.mapping = self.__init_mapping()
    self.linkMapping = self.__init_Link_mapping()
    self.doc_topic_matrix = loadPickleFile('doc_topic_matrix')
def getAllTopics(self, modelName='', numberOfTerms=100):
    '''
    modelName -> name of model to read in to memory without the extension
    '''

    if modelName == '':
        modelName = self.__fileName

    model = LdaModel.load(self.__destination + modelName + '.lda', mmap=None)
    return model.show_topics(num_topics=model.num_topics, num_words=numberOfTerms, formatted=False)
def __init__(self, categ, lda_num_topics):
    """ Initialize Predict class """
    collection_name = '%s_corpus' % categ
    dictionary_path = os.path.join(src_dir, 'models/dictionary_' + categ + '.dict')
    lda_model_path = os.path.join(dst_dir, 'models/lda_model_' + str(lda_num_topics) + '_topics_' + categ + '.lda')

    self.categ = categ
    self.collection_name = collection_name
    self.lda_num_topics = lda_num_topics
    self.dictionary = corpora.Dictionary.load(dictionary_path)
    self.lda = LdaModel.load(lda_model_path)
    self.stopwords = stopwords.words('english')
    self.lem = WordNetLemmatizer()
    self.tokenizer = regexp.RegexpTokenizer(r"[\w’]+", flags=re.UNICODE)
def worker(its, rows_array, lock, counter, n):
    model = LdaModel.load('tweets_es_25topics.lda')
    dictionary = corpora.Dictionary.load('tweets_es.dict')

    for i, t in its:
        tokens = tokenize(preprocess(t.text))
        doc_bow = dictionary.doc2bow(tokens)
        doc_lda = model[doc_bow]
        feats = doc_lda

        lock.acquire()
        rows_array[i] = feats
        counter.value += 1
        if i % 50 == 0:
            sofar = counter.value
            perc = sofar * 100.0 / n
            print("%.2f %% so far" % perc)
        lock.release()
def display(categ, lda_num_topics):
    u'''Display hidden topics'''
    lda_model_path = os.path.join(dst_dir, 'models/lda_model_' + str(lda_num_topics) + '_topics_' + categ + '.lda')
    lda = LdaModel.load(lda_model_path)
    top_list = lda.show_topics(num_topics=lda_num_topics, num_words=20, log=False, formatted=True)

    for index, top in enumerate(top_list, start=1):
        print(index, end=' ')
        # each formatted topic is a string like '0.05*word + 0.03*word + ...'
        topwords = top.split(' + ')
        for topword in topwords:
            member = topword.split('*')
            print(member[1], end=' ')
        print('')
def updateModel_LDA(self, dictname, modelName):
    #=======================================================================
    # GET THE MODEL'S LAST MODIFIED DATE IN MYSQL FORMAT, THEN FETCH AND
    # PREPARE THE DOCUMENTS ADDED SINCE THAT DATE
    #=======================================================================
    modelModified = time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(os.path.getmtime(self.__destination + modelName + '.lda')))
    self.getCorpusFromDB(modelModified)
    self.cleanPreparedCorpus()

    #=======================================================================
    # UPDATE DICT, CORPUS AND LDA MODEL WITH NEW DOCUMENTS
    #=======================================================================
    oldDict = corpora.Dictionary.load(self.__destination + dictname + '.dict')
    newCorpora = [oldDict.doc2bow(text) for text in self.__cleanedCorpus]

    #==============================================================================
    # oldDict = corpora.Dictionary.load(self.__destination+dictname+'.dict')
    # newDict = corpora.Dictionary(self.__cleanedCorpus)
    # mergedDict = oldDict.merge_with(newDict)
    #
    # oldCorpora = corpora.MmCorpus(self.__destination+dictname+'.mm')
    # newCorpora = [newDict.doc2bow(text) for text in self.__cleanedCorpus]
    # mergedCorpus = itertools.chain(oldCorpora, mergedDict[newCorpora])
    #
    # mergedDict.save(self.__destination+dictname+'.dict')
    # corpora.MmCorpus.serialize(self.__destination+dictname+'.mm', mergedCorpus)
    #==============================================================================

    # load without mmap: a model loaded with mmap='r' is backed by read-only
    # arrays and cannot be updated in place
    ldaModel = LdaModel.load(self.__destination + modelName + '.lda', mmap=None)
    ldaModel.update(newCorpora)
    ldaModel.save(self.__destination + modelName + '.lda')
def __init__(self, ldaModelFile, dictionaryfile, stopfile="english.stop.txt"):
    '''
    Constructor
    Parameters:
        ldaModelFile: the model file that was trained
        dictionaryfile: id2word mapping file
    '''
    logging.info("[Start] Loading the dictionary " + dictionaryfile)
    self.id2word = Dictionary.load(dictionaryfile)
    logging.info("[Done] Loading the dictionary " + dictionaryfile)

    logging.info("[Start] Loading the model file " + ldaModelFile)
    self.ldamodel = LdaModel.load(ldaModelFile)
    logging.info("[Done] Loading the model file " + ldaModelFile)

    logging.info("[Start] Loading all topics")
    self.alltopics = self.ldamodel.show_topics(-1)  # -1 -> all topics
    logging.info("[Done] Loading all topics")

    self.stopwords = self.loadStop(stopfile)
def run(new_review, rate):
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = nltk.PorterStemmer()

    # requires `import re`; the original called nltk.re.sub, which is not a public API
    letters_only = re.sub("[^a-zA-Z]", " ", new_review)
    raw = letters_only.lower()
    tokens = tokenizer.tokenize(raw)

    words = []
    tagged_text = nltk.pos_tag(tokens)
    for word, tag in tagged_text:
        words.append({"word": word, "pos": tag})

    # keep only (lemmatized) nouns
    lem = WordNetLemmatizer()
    nouns = []
    for word in words:
        if word["pos"] in ["NN", "NNS"]:
            nouns.append(lem.lemmatize(word["word"]))

    stopped_tokens = [i for i in nouns if i not in en_stop]
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

    dictionary = corpora.Dictionary.load("dictionary.dict")
    lda = LdaModel.load("lda_model_50_topics.lda")

    new_review_bow = dictionary.doc2bow(stemmed_tokens)
    new_review_lda = lda[new_review_bow]

    for topic, probability in new_review_lda:
        if topic in topics:
            topicName = topics.get(topic)
            ratingSum[topicName] = ratingSum.get(topicName) + probability * rate
            denominator[topicName] = denominator.get(topicName) + probability
import logging

from gensim.models import LdaModel
from gensim import corpora

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

dictionary_path = "models/dictionary.dict"
corpus_path = "models/corpus.lda-c"
lda_num_topics = 50
lda_model_path = "models/lda_model_50_topics.lda"

dictionary = corpora.Dictionary.load(dictionary_path)
corpus = corpora.BleiCorpus(corpus_path)
lda = LdaModel.load(lda_model_path)

for i, topic in enumerate(lda.show_topics(lda_num_topics)):
    print('#' + str(i) + ': ' + str(topic))
from gensim.models import LdaModel
from gensim import corpora
import nltk
import string

import Config

dictionary = corpora.Dictionary.load(Config.DICTIONARY_LOCAL)
lda = LdaModel.load(Config.LDA_LOCAL)


def clean_review(review):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    wnl = nltk.WordNetLemmatizer()
    cleaned_review_words = []

    # lower case and remove punctuation
    punctuation = set(string.punctuation)
    review_text = ''.join([c for c in review.lower() if c not in punctuation])

    sentences = nltk.sent_tokenize(review_text)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        for w in words:
            if w not in stopwords:
                # consider tagging POS, but that decreases performance drastically
                cleaned_review_words.append(wnl.lemmatize(w))

    return cleaned_review_words


def find_topics_in_review(cleaned_text):
    # minimal completion (the original body was missing): map the cleaned
    # tokens onto the trained dictionary and query the model
    bow = dictionary.doc2bow(cleaned_text)
    return lda[bow]
def use_lda_model(documents=[], model='', trained_dictionary='', verbose=True):
    global MODEL_FILENAME
    #> Again, extend https://www.kaggle.com/ykhorramz/lda-and-t-sne-interactive-visualization
    # NOTES:
    # - allow the caller to pass a reusable model/dictionary

    if not model:
        if not os.path.exists(MODEL_FILENAME):
            print("Training new model: " + MODEL_FILENAME)
            train_model_basic()
        model = LdaModel.load(MODEL_FILENAME)

    if not trained_dictionary:  # use the trained dictionary to look up bow
        trained_dictionary = corpora.Dictionary.load(TOPIC_DICTIONARY_FILENAME)
    print("Trained dict length: " + str(len(trained_dictionary)))

    if not documents:
        print("**USING DEFAULT DEMO DOCS...")
        documents = ['The world bank and the world bank women economic market']
        documents += [
            "System and human system engineering testing of EPS",
            "Relation of user perceived response time to error measurement",
            "The generation of random binary unordered trees",
        ]
        # combine into a single document
        documents = [" ".join(documents)]

    # 1/ Use the trained dictionary to look up bow and return standard text
    documents, other_texts, other_corpus, common_dictionary = get_corpora(
        documents=documents, common_dictionary=trained_dictionary, verbose=False)

    # 2/ Document vectors to topic vectors
    topic_vectors = model[other_corpus]

    # 3/ Review documents
    doc_topics = []
    for i, document in enumerate(documents):
        if verbose:
            print("Given document: " + str(document))
            print("Given tokenized: " + str(other_texts[i]))
            print("Using stemmed: " + str(other_corpus[i]))

        # sort topic matches by weight, strongest first
        topic_matches = [(tnum, percent) for tnum, percent in topic_vectors[i]]
        topic_matches.sort(key=lambda x: x[1], reverse=True)

        # keep only the best match
        for tnum, percent in topic_matches:
            topic_label = describe_topic(model, trained_dictionary, tnum)
            if verbose:
                print("Topic #" + str(tnum) + " match " + str(percent) + " >topic label: " + topic_label)
            doc_topics += [(tnum, percent, topic_label)]
            break
        if verbose:
            print()

    if 'review_topic_distribution_similarity_content' in '':  # dead branch, kept for reference
        lda_corpus1 = model[corpus1]
        top_dist1, _ = get_doc_topic_dist(model, lda_corpus1)
        #> possible cosine_sim... https://www.kaggle.com/ykhorramz/lda-and-t-sne-interactive-visualization

    print("**assumption: Topic label is first 3 most salient terms of topic cluster.")
    return doc_topics, model, trained_dictionary  # for reuse
""" Computing similarity between each category. """ import logging from gensim.models import LdaModel from sklearn.metrics.pairwise import cosine_similarity from utils.util import enpickle __author__ = 'kensk8er' if __name__ == '__main__': # logging logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) logging.info('Loading the model...') model = LdaModel.load('result/model_wiki.lda') topics = [] for topic_id in range(model.num_topics): topics.append(model.return_topic(topicid=topic_id)) similarity = cosine_similarity(topics) enpickle(similarity, 'result/topic_similarity/lda_wiki.pkl')
from pymongo import MongoClient
from gensim.models import LdaModel
from gensim import corpora
from Constants import Parameters

__author__ = 'Parry'

dictionary = corpora.Dictionary.load(Parameters.Dictionary_path)
corpus = corpora.BleiCorpus(Parameters.Corpus_path)
lda = LdaModel.load(Parameters.Lda_model_path)

corpus_collection = MongoClient(Parameters.MONGO_CONNECTION_STRING)[Parameters.REVIEWS_DATABASE][Parameters.CORPUS_COLLECTION]

# print the topic distribution for the first 20 reviews
corpus_cursor = corpus_collection.find()
for i, review in enumerate(corpus_cursor, start=1):
    print(lda[dictionary.doc2bow(review["words"])])
    if i == 20:
        break
#!/usr/bin/env python
from gensim.models import LdaModel
from gensim.corpora import MmCorpus, Dictionary
import sys, os
import pyLDAvis.gensim

if len(sys.argv) < 2:
    print("usage: {0} [path to model.lda]\n".format(sys.argv[0]))
    sys.exit(1)

path, fname = os.path.split(sys.argv[1])  # avoid shadowing the builtin 'file'
corpusname = fname.split(".")[0]

dictionary = Dictionary.load(path + "/" + corpusname + ".dict")
corpus = MmCorpus(path + "/" + corpusname + ".mm")
model = LdaModel.load(sys.argv[1])

##############
# cf. https://pyldavis.readthedocs.org/en/latest/modules/API.html
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
pyLDAvis.save_html(vis, path + "/" + corpusname + "_interactive.html")
pyLDAvis.show(vis)
########################################################################
doc_labels = []

# get document labels
print("\n get labels \n")
with open(os.path.join(path, "corpus_doclabels.txt"), "r") as f:
    for line in f:
        doc_labels.append(line.strip())  # strip the trailing newline from each label

# load corpus
print("\n load corpus \n")
corpus = MmCorpus(os.path.join(path, "corpus.mm"))

# load model
print("\n load model \n")
model = LdaModel.load(os.path.join(path, "corpus.lda"))

no_of_topics = model.num_topics
no_of_docs = len(doc_labels)

########################################################################
# get doc-topic matrix
########################################################################

doc_topic = np.zeros((no_of_docs, no_of_topics))

for doc, i in zip(corpus, range(no_of_docs)):
    topic_dist = model[doc]  # topic distribution from the model
    # topic_dist is a list of tuples (topic_id, topic_prob)
    for topic in topic_dist:
        doc_topic[i][topic[0]] = topic[1]  # save topic probability
def calculate_lda(dataset_raw, n_topics=10, lda_model_name="", mallet=True,
                  mallet_path="/Users/verasazonova/no-backup/JARS/mallet-2.0.7/bin/mallet",
                  dataname="none"):
    with open(dataname + "_log.txt", 'a') as fout:
        if dataset_raw.include_date:
            dates = [text[1] for text in dataset_raw]
            dataset = [normalize_words(text[0].split(), dataset_raw.stoplist) for text in dataset_raw]
        else:
            dates = ["" for _ in dataset_raw]
            dataset = dataset_raw

        bi_grams = Phrases(dataset, threshold=3)
        dataset = bi_grams[dataset]

        dictionary = Dictionary(dataset)
        dictionary.filter_extremes(no_below=1, no_above=0.9)
        bow_corpus = [dictionary.doc2bow(text) for text in dataset]

        fout.write("# Topics: %s\n" % n_topics)

        if not os.path.isfile(lda_model_name):
            if mallet:
                lda_model = LdaMallet(mallet_path, corpus=bow_corpus, num_topics=n_topics,
                                      id2word=dictionary, workers=4,
                                      optimize_interval=10, iterations=1000)
                lda_model_name = "lda_model_mallet_%s_%i" % (dataname, n_topics)
            else:
                lda_model = LdaModel(bow_corpus, id2word=dictionary, num_topics=n_topics,
                                     distributed=False, chunksize=2000, passes=5,
                                     update_every=10, alpha='asymmetric', eta=0.1, decay=0.5,
                                     eval_every=10, iterations=1000, gamma_threshold=0.001)
                lda_model_name = "lda_model_%s_%i" % (dataname, n_topics)
            lda_model.save(lda_model_name)
        else:
            if mallet:
                lda_model = LdaMallet.load(lda_model_name)
            else:
                lda_model = LdaModel.load(lda_model_name)

        topic_definition = []
        for i, topic in enumerate(lda_model.show_topics(n_topics, num_words=20, formatted=False)):
            fout.write("%i \n" % i)
            topic_list = []
            freq_list = []
            a_list = []
            for tup in topic:
                topic_list.append(tup[1])
                freq_list.append(dictionary.dfs[dictionary.token2id[tup[1]]])
                a_list.append(tup[0])
            fout.write("%s\n\n" % repr(sorted(zip(topic_list, freq_list), key=itemgetter(1))))
            topic_definition.append("%i, %s" % (i, repr(" ".join(sorted(topic_list)))[2:-1]))

        fout.write("Total number of documents: %i\n" % dictionary.num_docs)

        earliest_date = dateutil.parser.parse("Sun Jun 08 00:00:00 +0000 2014")
        a = [tup for tup in sorted(zip(bow_corpus, dates), key=get_date)
             if dateutil.parser.parse(tup[1]) > earliest_date]
        print(len(a))
        print(a[len(a) - 1])
        latest_date = dateutil.parser.parse(a[len(a) - 1][1])

        # bucket the documents into equal time bins and accumulate topic weights
        num_bins = 100
        time_span = latest_date - earliest_date
        print(time_span)
        time_bin = time_span / num_bins
        print(time_bin)

        bin_lows = [earliest_date]
        bin_high = earliest_date + time_bin
        counts = [[0 for _ in range(n_topics)] for _ in range(num_bins + 1)]
        i = 0
        for text in a:
            topic_assignments = lda_model[text[0]]
            date_str = text[1]
            if date_str is not None:
                cur_date = dateutil.parser.parse(date_str)
                if cur_date >= bin_high:
                    i += 1
                    bin_lows.append(bin_high)
                    bin_high = bin_lows[len(bin_lows) - 1] + time_bin
                for tup in topic_assignments:
                    counts[i][tup[0]] += tup[1]

        fout.write("Number of documents assigned mostly to the topic: \n")
        fout.write("%s\n" % counts)

    a = 1. * np.array(counts)
    np.savetxt("mpeketoni_cnts.txt", a)
    # separate handles so the log file's 'fout' is not shadowed
    with open("mpeketoni_bins.txt", 'w') as fbins:
        for date in bin_lows:
            fbins.write("%s\n" % date)
    with open("mpeketoni_labels.txt", 'w') as flabels:
        for label in topic_definition:
            flabels.write("%s\n" % label)
    return a, bin_lows, topic_definition
from TechDashAPI.mysqlUtilities import connectMySQL
from TechDashAPI.ContentExtractor import ContentExtractor
from TechDashAPI.ContentExtractorTrainer import ContentExtractorTrainer
from TechDashAPI.createDOM import createDom
from TechDashAPI.util import utilities
from TechDashAPI.topicModeling import techDashTopicModel
from gensim.models import LdaModel

db = connectMySQL(db='xpath', port=3366)
filesFolder = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/xpathModels/'
utilitiesFunctions = utilities()

modelDestination = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/modelsLDA/'
modelName = 'fullModel_100P_20T'
model = LdaModel.load(modelDestination + modelName + '.lda', mmap=None)
topicModel = techDashTopicModel(destination=modelDestination, fileName='fullModel', modelName='fullModel_100P_20T')

#===============================================================================
# UPDATE ALL ARTICLES TO NEW TOPICS
#===============================================================================

sqlQuery = """SELECT `xpathValuesXPath`.`xpathValuesID`, `xpathValuesXPath`.`xpathValuesContent` FROM `xpath`.`xpathValuesXPath`;"""

db.executeQuery(sqlQuery)

for item in db._connectMySQL__results:
    topicModelCat = topicModel.getDocumentTopics(item[1])
def __init__(self): print "Loading LDA model." self.model = LdaModel.load('big_lda_model', mmap=None) self.num_features = 100 self.dictionary = Dictionary.load('big_wiki_subset_dict.dict') print "LDA model loaded."
X = vectorizer.fit_transform(res)
vocab = vectorizer.get_feature_names()

start_time = time.time()
model = LdaMulticore(
    matutils.Sparse2Corpus(X, documents_columns=False),
    num_topics=9,
    passes=10,
    chunksize=5000,
    id2word=dict([(i, s) for i, s in enumerate(vocab)]),
    workers=7,
)
print("--- %s seconds ---" % (time.time() - start_time))

fname = '/Users/royyang/Desktop/trending_project/re_categorization_ls/LDA_9topics'
model.save(fname)

# Load a pretrained model
model = LdaModel.load(fname, mmap='r')
print(type(model))

#==============================================================================
# Get all topics from training:
# topic_number, number_of_articles, top_words
#==============================================================================
def get_topic(n):
    doc_lda = model[doc_list[n]]
    current_prob = 0
    for var in doc_lda:
        if var[1] > current_prob:
            current_prob = var[1]
            topic_num = var[0]
    return topic_num, re.sub(r'[+.0123456789\*]', '', topic[topic_num])
def __init__(self):
    dictionary_path = "topics_labels/models/dictionary.dict"
    lda_model_path = "topics_labels/models/lda_model_50_topics.lda"

    self.dictionary = corpora.Dictionary.load(dictionary_path)
    self.lda = LdaModel.load(lda_model_path)
# (tail of an LdaModel(...) training call; the leading arguments are elided)
                       passes=passes,
                       eval_every=eval_every)

# Calculate coherence
coherence_model_lda = CoherenceModel(model=reasonsModel, texts=reasonsBigram,
                                     dictionary=reasonsDictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Num Topics: ', num_topics, 'coherence is: ', coherence_lda)

reasonsModel.save(
    'C:/Users/matth/Documents/GitHub/Survey-Nonresponders/Data/reasonsModel_save'
)

reasonsModel = LdaModel.load(
    'C:/Users/matth/Documents/GitHub/Survey-Nonresponders/Data/reasonsModel_save'
)

for idx, topic in reasonsModel.print_topics(-1, 50):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

################################################################################################
# Now run the suggestions with one of the identified "sweet spots" and then get topic words    #
################################################################################################
chunksize = 1541
passes = 1000
iterations = 100000
eval_every = None  # evaluate perplexity
num_topics = 8