Example #1
def write_topics(model_path, csv_name, k):
    model = LdaModel.load(model_path)
    topics = []
    for topic_id in range(model.num_topics):
        topics.append(model.return_topic(topicid=topic_id))  # return_topic() is presumably a project-specific helper; stock gensim LdaModel does not provide it

    dictionary = Dictionary.load('data/dictionary/tweets.dict')
    word_indices = dictionary.id2token
    writer = csv.writer(open(csv_name, 'w'))

    output = [[0 for i in range(model.num_topics)] for j in range(k)]
    for topic_id, topic in enumerate(topics):
        for rank, index in enumerate(topic.argsort()[::-1]):
            output[rank][topic_id] = {}
            output[rank][topic_id]['word'] = word_indices[index]
            output[rank][topic_id]['p'] = topic[index]
            rank += 1
            if rank >= k:
                break

    for topic_id in range(model.num_topics):
        row = ['z = ' + str(topic_id)]

        for rank in range(k):
            row.append(output[rank][topic_id]['word'] + ':' + str(output[rank][topic_id]['p']))

        writer.writerow(row)
Example #2
    def __init__(self, destination, fileName, modelName='', ldaPasses='', topicNum=''):
        '''
        Constructor
        '''
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        
        self.__destination = destination
        self.__fileName = fileName
        self.__modelName = modelName
        self.__ldaPasses = ldaPasses
        self.__topicNum = topicNum
                
        #=======================================================================
        # STOP WORDS AND CHARACTERS
        #=======================================================================
        self.__stopwords = stopwords.words('english')# + string.punctuation
        self.__chars_to_remove = [u'[', u']', u'(', u')', u'*', u'%', u'{', u'}', u'\n', u'\n\n', u'\t', u';',u'/',u'^',u'--',u'\\',u'+',u'-',u'.',u'?',u'&',u'#',u'',u'']
        self.__stopwords.extend(self.__chars_to_remove)
        self.__stopwords.extend([item for item in string.punctuation])

        #=======================================================================
        # DATABASE
        #=======================================================================
        self.__db = connectMySQL(db='xpath', port=3366)
        self.__queryResults = None
        self.__cleanedCorpus = []
        

        if modelName != '' and os.path.exists(self.__destination+modelName+'.lda'):
            self.__ldaModel = LdaModel.load(self.__destination+modelName+'.lda', mmap='r') 
            
        if fileName != '' and os.path.exists(self.__destination+fileName+'.dict'):
            self.__modelDict = corpora.Dictionary.load(self.__destination+fileName+'.dict')
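Example #3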
def create_evaluation_distinctiveness(config, Kind):
    model_fname = config.model_fname % Kind.__name__

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except:
        error('Cannot evaluate LDA models that have not been built yet!')

    scores = utils.score(model, utils.kullback_leibler_divergence)
    total = sum([x[1] for x in scores])

    logger.info("%s model KL: %f" % (model_fname, total))
    with open(config.path + 'evaluate-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, total])

    etas = list()
    for topic in model.state.get_lambda():
        topic_eta = list()
        for p_w in topic:
            topic_eta.append(p_w * numpy.log2(p_w))
        etas.append(-sum(topic_eta))  # one entropy value per topic

    entropy = sum(etas) / len(etas)

    logger.info("%s model entropy mean: %f" % (model_fname, entropy))
    with open(config.path + 'evaluate-entropy-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, entropy])
Example #4
File: dmp.py Project: npiaq/dmp
 def load(self):
     '''Load the LDA model and the dictionary.
     '''
     lda_file = config.get('dmp', 'lda_file')
     dic_file = config.get('dmp', 'dic_file')
     self.lda = LdaModel.load(lda_file)
     self.dic = Dictionary.load(dic_file)
Example #5
def create_lda_model(project, corpus, id2word, name, use_level=True, force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level

    model_fname += '.lda.gz'


    if not os.path.exists(model_fname) or force:
        if corpus:
            update_every=None # run in batch if we have a pre-supplied corpus
        else:
            update_every=1

        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         alpha=project.alpha,
                         eta=project.eta,
                         passes=project.passes,
                         num_topics=project.num_topics,
                         iterations=project.iterations,
                         eval_every=None, # disable perplexity tests for speed
                         update_every=update_every,
                         )

        if corpus:
            model.save(model_fname)
    else:
        model = LdaModel.load(model_fname)

    return model, model_fname
Example #6
    def calculateLDADistance(self, modelName='', topNSimilar='', topicList=''):
        
        if modelName=='':
            modelName=self.__fileName
    
        if topNSimilar=='':
            topNSimilar=5       
            
        write2file = self.__destination+modelName+"_results_LDA_similarTopics.csv"
        resultsCSV = open(write2file, "wb")
        
        print 'Reading model data'
        gensimDict = corpora.Dictionary.load(self.__destination+self.__fileName+'.dict')
        ldaModel = LdaModel.load(self.__destination+modelName+'.lda',  mmap=None)
        topics = ldaModel.show_topics(num_topics=ldaModel.num_topics, num_words=len(gensimDict),formatted=False)
        #=======================================================================
        # num_topics=ldaModel.num_topics                             
        # num_words=len(gensimDict)
        #=======================================================================
        
        #=======================================================================
        # GET SIMILARITY VECTORS
        #=======================================================================
        print 'Extracting vectors'
        topicsSorted = [sorted(x,  key=lambda x: x[1]) for x in topics]
        vectors = []
            
        for topic in topicsSorted:
            vector = [item[0] for item in topic]
            vectors.append(vector)

        #=======================================================================    
        # CALCULATE SIMILARITIES BETWEEN TOPICS
        #=======================================================================
        print 'Calculating distances between LDA topics\n'
        results = []
        for topicListItem in topicList:
            distances = []
            for j in range (0, len(vectors)):
                dist = euclidean(vectors[topicListItem], vectors[j])
                #===============================================================
                # print topicListItem, j, dist
                #===============================================================
                distances.append(dist)
            results.append(distances)

        #=======================================================================
        # EXPORT TOP N SIMILAR TOPICS AND PRINT OUT QUERY TERMS
        #=======================================================================
        print 'Writing found similar topics to file\n'
        for resultItem in range(0,len(results)):
            similarLDATopics = np.argsort(results[resultItem])[::-1]
              
            for similarItem in similarLDATopics[:topNSimilar]:
                #===============================================================
                # print topicList[resultItem],similarItem
                #===============================================================
                resultsCSV.write(str(topicList[resultItem])+'; '+str(similarItem)+'; '+', '.join(x[1].lstrip().rstrip() for x in topics[similarItem][:100])+'\n\n')
            resultsCSV.write('*******************************************\n\n')
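Example #7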
def evaluate_log(context, config):
    logger.info('Evaluating models for: %s' % config.project.name)

    model_fname = config.model_fname % ChangesetCorpus.__name__
    changeset_fname = config.corpus_fname % ChangesetCorpus.__name__
    commit_fname = config.corpus_fname % CommitLogCorpus.__name__

    try:
        commit_id2word = Dictionary.load(commit_fname + '.dict')
        commit_corpus = MalletCorpus(commit_fname,
                                     id2word=commit_id2word)
        changeset_id2word = Dictionary.load(changeset_fname + '.dict')
        changeset_corpus = MalletCorpus(changeset_fname,
                                        id2word=changeset_id2word)
    except:
        error('Corpora not built yet -- cannot evaluate')

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except:
        error('Cannot evaluate LDA models that have not been built yet!')

    changeset_doc_topic = get_doc_topic(changeset_corpus, model)
    commit_doc_topic = get_doc_topic(commit_corpus, model)

    first_shared = dict()
    for id_ in commit_doc_topic:
        i = 0
        commit_topics = [topic[0] for topic in commit_doc_topic[id_]]
        try:
            changeset_topics = [topic[0] for topic in changeset_doc_topic[id_]]
        except:
            continue

        maximum = 101
        minimum = maximum

        for i, topic in enumerate(commit_topics):
            if topic in changeset_topics:
                j = changeset_topics.index(topic)
                minimum = min(minimum, max(i, j))

        for i, topic in enumerate(changeset_topics):
            if topic in commit_topics:
                j = commit_topics.index(topic)
                minimum = min(minimum, max(i, j))

        first_shared[id_] = minimum

        if minimum == maximum:
            logger.info('No common topics found for %s' % str(id_))
            del first_shared[id_]

    mean = sum(first_shared.values()) / len(first_shared)

    with open('data/evaluate-log-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, mean] + list(first_shared.values()))
Example #8
  def __init__(self):

    cwd = os.path.dirname(__file__)
    dictionary_path = os.path.abspath(os.path.join(cwd, 'models/dictionary.dict'))
    lda_model_path = os.path.abspath(os.path.join(cwd, 'models/lda_model_10_topics.lda'))

    self.dictionary = corpora.Dictionary.load(dictionary_path)
    self.lda = LdaModel.load(lda_model_path)
Example #9
def load_lda_model(lda_model_name=None, mallet=False):
    if lda_model_name and os.path.isfile(lda_model_name):
        if mallet:
            lda_model = LdaMallet.load(lda_model_name)
        else:
            lda_model = LdaModel.load(lda_model_name)
        return lda_model
    return None
Example #10
    def analyzeUniqueLDA(self, modelName='', numberOfTerms=''):
        '''
        modelName -> name of model to read into memory, without the extension
        '''
        
        if modelName=='':
            modelName=self.__fileName
            
        if numberOfTerms=='':
            numberOfTerms=100
            
        write2File = self.__destination+modelName+"_results_unique_%sTerms.csv"%(numberOfTerms)
        resultsCSV = open(write2File, "wb")
        
        model = LdaModel.load(self.__destination+modelName+'.lda',  mmap=None)

         
        #and another way, only prints top words 
        for t in range(0, model.num_topics-1):
            #===================================================================
            # print 'topic {}: '.format(t) + ', '.join([v[1] for v in model.show_topic(t, 500)])
            #===================================================================
            # raw_input('prompt')
            topicSet = [v[1].lstrip().rstrip() for v in model.show_topic(t, numberOfTerms) if v[1] not in self.__stopwords]
            #===================================================================
            # print type(topicSet), topicSet
            #===================================================================
            listSet = set(topicSet)
            #print listSet
            #print type(topicSet), topicSet
            for key in self.__queryWords:  
                #print self.__queryWords[key]
                difference = set(topicSet).intersection(self.__queryWords[key])
                 
                if len(difference) > 0:
                    self.__overlapingTopics[key][t]=topicSet
        
        try:
            for key in self.__overlapingTopics:
                uniqueQueryTerms = []
                if self.__overlapingTopics[key]:
                    for topicKey in self.__overlapingTopics[key]:
                        topicTerms = [w for w in self.__overlapingTopics[key][topicKey] if w not in self.__stopwords]
                        uniqueQueryTerms.extend(topicTerms)
                        
                uniqueQueryTerms = [x for x in set(uniqueQueryTerms)]
                resultsCSV.write(key+';'+str(topicKey)+';'+', '.join(uniqueQueryTerms)+'\n\n')
                resultsCSV.write('***************************************\n')
                print key, uniqueQueryTerms
                print '*************************\n'

        except KeyError as e: 
            print e
            pass 
        
        resultsCSV.close()
Example #11
    def analyzeLDA(self, modelName='', numberOfTerms=''):
        '''
        modelName -> name of model to read into memory, without the extension
        '''
        
        if modelName=='':
            modelName=self.__fileName
            
        if numberOfTerms == '':
            numberOfTerms=100
            
        write2file = self.__destination+modelName+"_results_%s_SW.csv"%(numberOfTerms)
        #=======================================================================
        # allTopicsFile = self.__destination+modelName+"_results_AllTopics.csv"
        #=======================================================================
        
        resultsCSV = open(write2file, "wb")
        model = LdaModel.load(self.__destination+modelName+'.lda',  mmap=None)
         
        #and another way, only prints top words 
        for t in range(0, model.num_topics-1):
            #===================================================================
            # print 'topic {}: '.format(t) + ', '.join([v[1] for v in model.show_topic(t, numberOfTerms)])
            #===================================================================

            topicSet = [v[1].lstrip().rstrip() for v in model.show_topic(t, numberOfTerms) if v[1] not in self.__stopwords]
            listSet = set(topicSet)

            for key in self.__queryWords:  
                difference = set(topicSet).intersection(self.__queryWords[key])
                 
                if len(difference) > 0:
                    self.__overlapingTopics[key][t]=topicSet
        
        try:
            for key in self.__overlapingTopics:
                if self.__overlapingTopics[key]:
                    for topicKey in self.__overlapingTopics[key]:
                        topicTerms = [w.lstrip().rstrip() for w in self.__overlapingTopics[key][topicKey] if w not in self.__stopwords][:100]
                        #=======================================================
                        # topicTerms = [w.translate(None, ''.join(self.__chars_to_remove)) for w in topicTerms if w !='']
                        #=======================================================
                        resultsCSV.write(key+';'+str(topicKey)+';'+', '.join(topicTerms)+'\n\n')
                        print key,'\t',topicKey,'\t', topicTerms
                    resultsCSV.write('***************************************\n')
                print '*************************\n'
                
            write2fileJSON = self.__destination+modelName+"_results_%s_SW.json"%(numberOfTerms)
            with open(write2fileJSON, 'w') as fp:
                json.dump(self.__overlapingTopics, fp)
     
        except KeyError as e: 
            print e
            pass 
        
        resultsCSV.close()
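Example #12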
import numpy as np
from gensim.models import LdaModel


def get_keywords(threshold=0.01, model_path='result/model.lda'):
    lda_model = LdaModel.load(model_path)
    topic_num = lda_model.num_topics
    keywords = set()
    for topic_id in range(topic_num):
        topic = lda_model.state.get_lambda()[topic_id]
        topic = topic / topic.sum()  # normalize to probability dist
        signif_word_ids = np.where(topic > threshold)[0]
        keywords = keywords.union([lda_model.id2word[word_id] for word_id in signif_word_ids])

    return keywords
Example #13
    def __init__(self):
        # current_working_dir = '/home/etu/eason/nodejs/Semantic_Aware_RecSys'
        current_working_dir = '.'
        os.chdir(current_working_dir)
        lda_model_path = "./LDAmodel/final_ldamodel"

        self.lda = LdaModel.load(lda_model_path)
        self.no_of_recommendation = 10
        self.omit_topic_below_this_fraction = 0.1
        self.mapping = self.__init_mapping()
        self.linkMapping = self.__init_Link_mapping()
        self.doc_topic_matrix = loadPickleFile('doc_topic_matrix')
Example #14
 def getAllTopics(self, modelName='', numberOfTerms=100):
     '''
     modelName -> name of model to read into memory, without the extension
     '''
     
     returningData = {}
     
     if modelName=='':
         modelName=self.__fileName
         
     model = LdaModel.load(self.__destination+modelName+'.lda',  mmap=None)
     
     return model.show_topics(num_topics=model.num_topics,num_words=numberOfTerms, formatted=False)
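Example #15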
    def __init__(self, categ, lda_num_topics):
        """
        Initialize Predict class
        """
        collection_name = '%s_corpus' % categ
        dictionary_path = os.path.join(src_dir, 'models/dictionary_' + categ + '.dict')
        lda_model_path = os.path.join(dst_dir, 'models/lda_model_' + str(lda_num_topics) +'_topics_' + categ + '.lda')

        self.categ = categ
        self.collection_name = collection_name
        self.lda_num_topics = lda_num_topics
        self.dictionary = corpora.Dictionary.load(dictionary_path)
        self.lda = LdaModel.load(lda_model_path)
        self.stopwords = stopwords.words('english')
        self.lem = WordNetLemmatizer()
        self.tokenizer = regexp.RegexpTokenizer("[\w’]+", flags=re.UNICODE)
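Example #16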
    def worker(its, rows_array, lock, counter, n):
        model = LdaModel.load('tweets_es_25topics.lda')
        dictionary = corpora.Dictionary.load('tweets_es.dict')
        for i, t in its:
            tokens = tokenize(preprocess(t.text))
            doc_bow = dictionary.doc2bow(tokens)
            doc_lda = model[doc_bow]

            feats = doc_lda
            lock.acquire()
            rows_array[i] = feats
            counter.value += 1
            if i % 50 == 0:
                sofar = counter.value
                perc = sofar * 100.0 / n
                print ("%.2f %% so far" % perc )
            lock.release()
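Example #17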
def display(categ, lda_num_topics):
    u'''Display hidden topics'''
    lda_model_path = os.path.join(dst_dir, 'models/lda_model_' + str(lda_num_topics) +'_topics_' + categ + '.lda')
    lda = LdaModel.load(lda_model_path)
    top_list = lda.show_topics(num_topics=lda_num_topics, num_words=20, log=False, formatted=True)
    index = 0
    for top in top_list:
        index += 1
        print index,
        #scores = []
        #words = []
        topwords = top.split(' + ')
        for topword in topwords:
            member = topword.split('*')
            print member[1],
            #words.append(member[1])
            #scores.append(member[0])
        print ''
Example #18
    def updateModel_LDA(self, dictname, modelName):
        
        #=======================================================================
        # GET LAST MODIFIED DATE IN MYSQL FORMAT
        #=======================================================================

        
        #=======================================================================
        # GET NEW DOCUMENTS SINCE LAST MODIFIED DATE AND PREPARE THEM
        #=======================================================================
        modelModified = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(os.path.getmtime(self.__destination+modelName+'.lda') ))
        self.getCorpusFromDB(modelModified)
        self.cleanPreparedCorpus()
        
        #=======================================================================
        # UPDATE DICT, CORPUS AND LDA MODEL WITH NEW DOCUMENTS
        #=======================================================================
        oldDict = corpora.Dictionary.load(self.__destination+dictname+'.dict')
        newCorpora = [oldDict.doc2bow(text) for text in self.__cleanedCorpus]
 #==============================================================================
 #        oldContent =  self.__destination+dictname+'.mm'
 #        print oldContent
 #         
 #        oldDict = corpora.Dictionary.load(self.__destination+dictname+'.dict')
 #        newDict = corpora.Dictionary(self.__cleanedCorpus)
 #        mergedDict = oldDict.merge_with(newDict)
 # 
 #        oldCorpora = corpora.MmCorpus(self.__destination+dictname+'.mm')
 #        newCorpora = [newDict.doc2bow(text) for text in self.__cleanedCorpus]
 #        mergedCorpus = itertools.chain(oldCorpora,mergedDict[newCorpora])
 #     
 #        mergedDict.save(self.__destination+dictname+'.dict')
 #        corpora.MmCorpus.serialize(self.__destination+dictname+'.mm', mergedCorpus)
 #==============================================================================

        #=======================================================================
        # dict = corpora.Dictionary.load(self.__destination+modelName+'.dict')
        # mm = corpora.MmCorpus(self.__destination+modelName+'.mm')
        #=======================================================================
        ldaModel = LdaModel.load(self.__destination+modelName+'.lda', mmap='r')
        ldaModel.update(newCorpora)
        ldaModel.save(self.__destination+modelName+'.lda')
Example #19
	def __init__(self, ldaModelFile, dictionaryfile, stopfile="english.stop.txt"):
		'''
			Constructor
			Parameters:
				ldaModelFile: the model file that was trained 
				dictionaryfile: id2word mapping file
		'''
		logging.info("[Start] Loading the dictionary " + dictionaryfile)
		self.id2word = Dictionary.load(dictionaryfile)
		logging.info("[Stop] Loading the dictionary " + dictionaryfile)

		logging.info("[Start] Loading the model file " + ldaModelFile)
		self.ldamodel = LdaModel.load(ldaModelFile)
		logging.info("[Done] Loading the model file " + ldaModelFile)

		logging.info("[Start] Loading all topics")
		self.alltopics = self.ldamodel.show_topics(-1)
		logging.info("[Start] Loading all topics")

		self.stopwords = self.loadStop(stopfile)
Example #20
def run(new_review, rate):
    tokenizer = nltk.RegexpTokenizer(r'\w+')

    en_stop = get_stop_words('en')

    p_stemmer = nltk.PorterStemmer()
    letters_only = nltk.re.sub("[^a-zA-Z]", " ", new_review)
    raw = letters_only.lower()
    tokens = tokenizer.tokenize(raw)

    words = []
    tagged_text = nltk.pos_tag(tokens)
    for word, tag in tagged_text:
        words.append({"word": word, "pos": tag})

    lem = WordNetLemmatizer()
    nouns = []
    for word in words:
        if word["pos"] in ["NN", "NNS"]:
            nouns.append(lem.lemmatize(word["word"]))

    stopped_tokens = [i for i in nouns if not i in en_stop]

    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

    dictionary = corpora.Dictionary.load("dictionary.dict")
    lda = LdaModel.load("lda_model_50_topics.lda")

    new_review_bow = dictionary.doc2bow(stemmed_tokens)
    new_review_lda = lda[new_review_bow]

    for i in new_review_lda:
        topic = i[0]
        probability = i[1]
        if topic in topics:
            topicName = topics.get(topic)
            ratingSum[topicName] = ratingSum.get(topicName) + probability * rate
            denominator[topicName] = denominator.get(topicName) + probability
Example #21
import logging

from gensim.models import LdaModel
from gensim import corpora


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

dictionary_path = "models/dictionary.dict"
corpus_path = "models/corpus.lda-c"
lda_num_topics = 50
lda_model_path = "models/lda_model_50_topics.lda"

dictionary = corpora.Dictionary.load(dictionary_path)
corpus = corpora.BleiCorpus(corpus_path)
lda = LdaModel.load(lda_model_path)

i = 0
for topic in lda.show_topics(lda_num_topics):
    print '#' + str(i) + ': ' + topic
    i += 1

Example #22
from gensim.models import LdaModel
from gensim import corpora
import nltk
import string
import Config

dictionary = corpora.Dictionary.load(Config.DICTIONARY_LOCAL)
lda = LdaModel.load(Config.LDA_LOCAL)

def clean_review(review):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    wnl = nltk.WordNetLemmatizer()

    counter = 0
    cleaned_review_words = []

    # lower case and remove punctuation
    punctuation = set(string.punctuation)
    review_text = (''.join([c for c in review.lower() if not c in punctuation]))

    sentences = nltk.sent_tokenize(review_text)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        for w in words:
            if w not in stopwords:
                # consider tagging POS, but decreases performance drastically
                cleaned_review_words.append(wnl.lemmatize(w))

    return cleaned_review_words

def find_topics_in_review(cleaned_text):
    # assumed completion: convert the cleaned tokens to a bag-of-words and
    # query the LDA model loaded above for its topic distribution
    bow = dictionary.doc2bow(cleaned_text)
    return lda[bow]
Example #23
def use_lda_model(documents=[],model='',trained_dictionary='',verbose=True):
    global MODEL_FILENAME
    #> Again, extend https://www.kaggle.com/ykhorramz/lda-and-t-sne-interactive-visualization

    #NOTES:
    #- allow user to pass reusable model/dictionary
    
    
    if not model:
        if not os.path.exists(MODEL_FILENAME):
            print ("Training new model: "+MODEL_FILENAME)
            train_model_basic()
    
        model = LdaModel.load(MODEL_FILENAME)
        
    if not trained_dictionary:
        #Use trained dictionary to lookup bow
        trained_dictionary = corpora.Dictionary.load(TOPIC_DICTIONARY_FILENAME)
    print ("Trained dict length: "+str(len(trained_dictionary)))
    
    if not documents:
        print ("**USING DEFAULT DEMO DOCS...")
        documents=['The world bank and the world bank women economic market']
        documents+= [
                 "System and human system engineering testing of EPS",
                 "Relation of user perceived response time to error measurement",
                 "The generation of random binary unordered trees",
                 ]
    
    #Combine# documents=[" ".join(documents)]
    
    
    
    #1/  Use trained dictionary to lookup bow and return standard text
    documents,other_texts,other_corpus,common_dictionary=get_corpora(documents=documents,common_dictionary=trained_dictionary,verbose=False)

    #2/  Docs vectors to topic vectors
    topic_vectors = model[other_corpus]
    
    #3/  Review documents
    doc_topics=[]
    for i,document in enumerate(documents):
        if verbose:
            print ("Given document: "+str(document))
            print ("Given tokenized: "+str(other_texts[i]))
            print ("Using stemmed: "+str(other_corpus[i]))
        
        #/ transform topic into top list
        topic_matches=[]
        for tnum,percent in topic_vectors[i]:topic_matches+=[(tnum,percent)]
        topic_matches.sort(key=lambda x:x[1],reverse=True)
        
        #/ Iter top topic matches (break at 0)
        tnum = -1; percent = 0; topic_label = ''
        for tnum,percent in topic_matches:
            topic_label=describe_topic(model,trained_dictionary,tnum)
            if verbose:
                print ("Topic #"+str(tnum)+" match "+str(percent)+"%  >topic label: "+topic_label)
            doc_topics+=[(tnum,percent,topic_label)]
            break
        if verbose: print

    if 'review_topic_distribution_similarity_content' in '':
        lda_corpus1 = model[corpus1]
        top_dist1, _ = get_doc_topic_dist(model, lda_corpus1)
        #> possible cosine_sim...https://www.kaggle.com/ykhorramz/lda-and-t-sne-interactive-visualization

    print ("**assumption:  Topic label is first 3 most salient terms of topic cluster.")
    return doc_topics,model,trained_dictionary #For reuse
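Example #24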
"""
Computing similarity between each category.
"""
import logging
from gensim.models import LdaModel
from sklearn.metrics.pairwise import cosine_similarity
from utils.util import enpickle

__author__ = 'kensk8er'


if __name__ == '__main__':
    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

    logging.info('Loading the model...')
    model = LdaModel.load('result/model_wiki.lda')
    topics = []
    for topic_id in range(model.num_topics):
        topics.append(model.return_topic(topicid=topic_id))

    similarity = cosine_similarity(topics)
    enpickle(similarity, 'result/topic_similarity/lda_wiki.pkl')
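Example #25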
from pymongo import MongoClient

__author__ = 'Parry'
from gensim.models import LdaModel
from gensim import corpora
from Constants import Parameters



dictionary = corpora.Dictionary.load(Parameters.Dictionary_path)
corpus = corpora.BleiCorpus(Parameters.Corpus_path)
lda = LdaModel.load(Parameters.Lda_model_path)



corpus_collection = MongoClient(Parameters.MONGO_CONNECTION_STRING)[Parameters.REVIEWS_DATABASE][Parameters.CORPUS_COLLECTION]

i=0
corpus_cursor = corpus_collection.find()
for review in corpus_cursor:
             # assume there's one document per line, tokens separated by whitespace
             i=i+1
             print lda[dictionary.doc2bow(review["words"])]
             if i == 20:
                 break


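Example #26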
#!/usr/bin/env python

from gensim.models import LdaModel
from gensim.corpora import MmCorpus, Dictionary
import sys, os
import pyLDAvis.gensim


if len(sys.argv) < 2:
    print("usage: {0} [path to model.lda]\n".format(sys.argv[0]))
    sys.exit(1)


path, file = os.path.split(sys.argv[1])
corpusname = file.split(".")[0]

dictionary = Dictionary.load(path+"/"+corpusname+".dict")
corpus = MmCorpus(path+"/"+corpusname+".mm")
model = LdaModel.load(sys.argv[1])


##############
# cf. https://pyldavis.readthedocs.org/en/latest/modules/API.html

vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)

pyLDAvis.save_html(vis, path+"/"+corpusname+"_interactive.html")
pyLDAvis.show(vis)
Example #27
########################################################################

doc_labels = []

# get document labels
print("\n get labels \n")
with open(os.path.join(path, "corpus_doclabels.txt"), "r") as f:
    for line in f: doc_labels.append(line.strip())

# load corpus
print("\n load corpus \n")
corpus = MmCorpus(os.path.join(path,"corpus.mm"))

# load model
print("\n load model \n")
model = LdaModel.load(os.path.join(path,"corpus.lda"))

no_of_topics = model.num_topics
no_of_docs = len(doc_labels)


########################################################################
# get doc-topic matrix
########################################################################

doc_topic = np.zeros((no_of_docs, no_of_topics))

for doc, i in zip(corpus, range(no_of_docs)):           
    topic_dist = model.__getitem__(doc)                 # to get topic distribution from model
    for topic in topic_dist:                            # topic_dist is a list of tuples (topic_id, topic_prob)
        doc_topic[i][topic[0]] = topic[1]               # save topic probability
Example #28
def calculate_lda(dataset_raw, n_topics=10, lda_model_name="",
                  mallet=True, mallet_path="/Users/verasazonova/no-backup/JARS/mallet-2.0.7/bin/mallet",
                  dataname="none"):

    with open(dataname+"_log.txt", 'a') as fout:

        if dataset_raw.include_date:
            dates = [text[1] for text in dataset_raw]
            dataset = [normalize_words(text[0].split(), dataset_raw.stoplist) for text in dataset_raw]
        else:
            dates = ["" for _ in dataset_raw]
            dataset = dataset_raw

        bi_grams = Phrases(dataset, threshold=3)
        dataset = bi_grams[dataset]


        dictionary = Dictionary(dataset)
        dictionary.filter_extremes(no_below=1, no_above=0.9)

        bow_corpus = [dictionary.doc2bow(text) for text in dataset]

        fout.write("# Topics: %s\n" % n_topics)

        if not os.path.isfile(lda_model_name):

            if mallet:
                lda_model = LdaMallet(mallet_path, corpus=bow_corpus, num_topics=n_topics, id2word=dictionary, workers=4,
                                     optimize_interval=10, iterations=1000)
                lda_model_name = "lda_model_mallet_%s_%i" % (dataname, n_topics)
            else:
                lda_model = LdaModel(bow_corpus, id2word=dictionary, num_topics=n_topics, distributed=False,
                                    chunksize=2000, passes=5, update_every=10, alpha='asymmetric',
                                    eta=0.1, decay=0.5, eval_every=10, iterations=1000, gamma_threshold=0.001)

                lda_model_name = "lda_model_%s_%i" % (dataname, n_topics)

            lda_model.save(lda_model_name)

        else:
            if mallet:
                lda_model = LdaMallet.load(lda_model_name)
            else:
                lda_model = LdaModel.load(lda_model_name)

        topic_definition = []

        for i, topic in enumerate(lda_model.show_topics(n_topics, num_words=20, formatted=False)):
            fout.write("%i \n" % i)
            topic_list = []
            freq_list = []
            a_list = []
            for tup in topic:
                topic_list.append(tup[1])
                freq_list.append(dictionary.dfs[ dictionary.token2id[tup[1]] ] )
                a_list.append(tup[0])


            fout.write( "%s\n\n" % repr((sorted(zip(topic_list, freq_list), key=itemgetter(1) ))))

            topic_definition.append("%i, %s" %(i, repr(" ".join(sorted(topic_list)))[2:-1]))

        fout.write("Total number of documents: %i\n" % dictionary.num_docs )



        earliest_date = dateutil.parser.parse("Sun Jun 08 00:00:00 +0000 2014")

        a = [tup for tup in  sorted(zip(bow_corpus, dates), key=get_date )
             if dateutil.parser.parse(tup[1]) > earliest_date]

        print len(a)
        print a[len(a)-1]
        latest_date = dateutil.parser.parse(a[len(a)-1][1])

        num_bins = 100

        time_span = latest_date - earliest_date
        print time_span
        time_bin = time_span / num_bins
        print time_bin

        bin_lows = [earliest_date]
        bin_high = earliest_date + time_bin
        counts = [[0 for _ in range(n_topics)] for _ in range(num_bins+1)]
        i=0
        for text in a:
            topic_assignments = lda_model[text[0]]
            date_str = text[1]
            if date_str is not None:
                cur_date = dateutil.parser.parse(date_str)
                if cur_date >= bin_high:
                    i+=1
                    bin_lows.append(bin_high)
                    bin_high = bin_lows[len(bin_lows)-1] + time_bin
                #counts[i][max(topic_assignments, key=itemgetter(1))[0]] += 1
                for tup in topic_assignments:
                    counts[i][tup[0]] += tup[1]

        fout.write("Number of documents assigned mostly to the topic: \n")
        fout.write("%s\n" % counts)

        a = 1.*np.array(counts)

        np.savetxt("mpeketoni_cnts.txt", a)
        with open("mpeketoni_bins.txt", 'w') as fout:
            for date in bin_lows:
                fout.write("%s\n" % date)
        with open("mpeketoni_labels.txt", 'w') as fout:
            for label in topic_definition:
                fout.write("%s\n" % label)

        return a, bin_lows, topic_definition
Example #29
from TechDashAPI.mysqlUtilities import connectMySQL
from TechDashAPI.ContentExtractor import ContentExtractor
from TechDashAPI.ContentExtractorTrainer import ContentExtractorTrainer
from TechDashAPI.createDOM import createDom
from TechDashAPI.util import utilities
from TechDashAPI.topicModeling import techDashTopicModel

from gensim.models import LdaModel

db = connectMySQL(db='xpath', port=3366)
filesFolder = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/xpathModels/'
utilitiesFunctions = utilities()

modelDestination = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/modelsLDA/'
modelName ='fullModel_100P_20T'
model = LdaModel.load(modelDestination+modelName+'.lda',  mmap=None)
topicModel = techDashTopicModel(destination='/Users/jurica/Documents/workspace/eclipse/TechDashboard/modelsLDA/', fileName='fullModel', modelName='fullModel_100P_20T')

#===============================================================================
# UPDATE ALL ARTICLES TO NEW TOPICS
#===============================================================================

sqlQuery = """SELECT `xpathValuesXPath`.`xpathValuesID`, `xpathValuesXPath`.`xpathValuesContent` FROM `xpath`.`xpathValuesXPath`; """

db.executeQuery(sqlQuery)

for item in db._connectMySQL__results:
    #===========================================================================
    # print item
    #===========================================================================
    topicModelCat = topicModel.getDocumentTopics(item[1])
Example #30
 def __init__(self):
     print "Loading LDA model."
     self.model = LdaModel.load('big_lda_model', mmap=None)
     self.num_features = 100
     self.dictionary = Dictionary.load('big_wiki_subset_dict.dict')
     print "LDA model loaded."
X = vectorizer.fit_transform(res)
vocab = vectorizer.get_feature_names()
start_time = time.time()
model = LdaMulticore(
                    matutils.Sparse2Corpus(X,documents_columns=False), 
                    num_topics=9,passes=10,
                    chunksize=5000,
                    id2word=dict([(i, s) for i, s in enumerate(vocab)]),
                    workers=7,
                    )
print("--- %s seconds ---" % (time.time() - start_time))
fname = '/Users/royyang/Desktop/trending_project/re_categorization_ls/LDA_9topics'
model.save(fname)

#Load a pretrained model
model = LdaModel.load(fname, mmap='r')
type(model)

#==============================================================================
# # Get all topics from training 
# topic_number, number_of_articles, top_words
#==============================================================================
def get_topic(n):
    doc_lda = model[doc_list[n]]    
    current_prob = 0
    for var in doc_lda:
        if var[1]>current_prob:
            current_prob = var[1]
            topic_num = var[0]
    return topic_num,re.sub('[+.0123456789\*]','',topic[topic_num])
Example #32
 def __init__(self):
     dictionary_path = "topics_labels/models/dictionary.dict"
     lda_model_path = "topics_labels/models/lda_model_50_topics.lda"
     self.dictionary = corpora.Dictionary.load(dictionary_path)
     self.lda = LdaModel.load(lda_model_path)
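Example #33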
                        passes=passes,
                        eval_every=eval_every)

# Calculate coherence
coherence_model_lda = CoherenceModel(model=reasonsModel,
                                     texts=reasonsBigram,
                                     dictionary=reasonsDictionary,
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Num Topics: ', num_topics, 'coherence is: ', coherence_lda)

reasonsModel.save(
    'C:/Users/matth/Documents/GitHub/Survey-Nonresponders/Data/reasonsModel_save'
)
reasonsModel = LdaModel.load(
    'C:/Users/matth/Documents/GitHub/Survey-Nonresponders/Data/reasonsModel_save'
)

for idx, topic in reasonsModel.print_topics(-1, 50):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

################################################################################################
#Now run the suggestions with one of the identified "sweet spots" and then get topic words     #
################################################################################################

chunksize = 1541
passes = 1000
iterations = 100000
eval_every = None  # evaluate perplexity
num_topics = 8