def create_evaluation_perplexity(config, Kind):
    model_fname = config.model_fname % Kind.__name__
    corpus_fname = config.corpus_fname % Kind.__name__

    try:
        id2word = Dictionary.load(corpus_fname + '.dict')
        corpus = MalletCorpus(corpus_fname, id2word=id2word)
    except:
        error('Corpora not built yet -- cannot evaluate')

    held_out = list()
    training = list()
    target_len = int(0.1 * len(corpus))
    logger.info('Calculating perplexity with held-out %d of %d documents' %
                (target_len, len(corpus)))

    ids = set()
    while len(ids) < target_len:
        ids.add(random.randint(0, len(corpus) - 1))  # randint's upper bound is inclusive

    for doc_id, doc in enumerate(corpus):
        if doc_id in ids:
            held_out.append(doc)
        else:
            training.append(doc)

    model = LdaModel(training,
                     id2word=corpus.id2word,
                     alpha=config.alpha,
                     passes=config.passes,
                     num_topics=config.num_topics)

    pwb = model.log_perplexity(held_out)

    with open(config.path + 'evaluate-perplexity-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, pwb])
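# A hedged aside (not part of the example above): log_perplexity returns a
# per-word likelihood bound, and gensim reports perplexity as 2 ** (-bound).
# Tiny self-contained sketch of that conversion on a made-up corpus:
from gensim.corpora import Dictionary
from gensim.models import LdaModel

toy_texts = [['human', 'interface'], ['graph', 'trees'], ['graph', 'minors', 'trees']]
toy_dict = Dictionary(toy_texts)
toy_bow = [toy_dict.doc2bow(t) for t in toy_texts]
toy_lda = LdaModel(toy_bow, id2word=toy_dict, num_topics=2, passes=5)
bound = toy_lda.log_perplexity(toy_bow)   # per-word likelihood bound
perplexity = 2 ** (-bound)                # the figure usually quoted as "perplexity"
print(bound, perplexity)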
class TestLdaCallback(unittest.TestCase):

    def setUp(self):
        self.corpus = MmCorpus(datapath('testcorpus.mm'))
        self.ch_umass = CoherenceMetric(corpus=self.corpus, coherence="u_mass", logger="visdom", title="Coherence")
        self.callback = [self.ch_umass]
        self.model = LdaModel(id2word=common_dictionary, num_topics=2, passes=10, callbacks=self.callback)

        self.host = "http://localhost"
        self.port = 8097

    def testCallbackUpdateGraph(self):

        # Popen has no context manager in Python 2.7, hence the try/finally.
        try:
            # spawn visdom.server
            proc = subprocess.Popen(['python', '-m', 'visdom.server', '-port', str(self.port)])

            # wait for visdom server startup (any better way?)
            time.sleep(3)

            viz = Visdom(server=self.host, port=self.port)
            assert viz.check_connection()

            # clear screen
            viz.close()

            self.model.update(self.corpus)
        finally:
            proc.kill()
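# A hedged variant of the cleanup above: on Python 3, Popen itself is a
# context manager, so the process is reaped on exit. The server still has
# to be killed explicitly, since visdom.server never terminates on its own.
import subprocess
import time

with subprocess.Popen(['python', '-m', 'visdom.server', '-port', '8097']) as proc:
    try:
        time.sleep(3)  # crude wait for the server to come up, as in the test
        # ... connect with Visdom and run the model update here ...
    finally:
        proc.kill()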
def create_lda_model(project, corpus, id2word, name, use_level=True, force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level

    model_fname += '.lda.gz'


    if not os.path.exists(model_fname) or force:
        if corpus:
            update_every=None # run in batch if we have a pre-supplied corpus
        else:
            update_every=1

        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         alpha=project.alpha,
                         eta=project.eta,
                         passes=project.passes,
                         num_topics=project.num_topics,
                         iterations=project.iterations,
                         eval_every=None, # disable perplexity tests for speed
                         update_every=update_every,
                         )

        if corpus:
            model.save(model_fname)
    else:
        model = LdaModel.load(model_fname)

    return model, model_fname
Example #4
def perform_lda(dictionary, corpus, num_topics, wiki_path=None, passes=1, iterations=50, chunksize=200):
    """


    :param dictionary:
    :param corpus:
    :param wiki_path:
    :param num_topics:
    :param passes:
    :param iterations:
    :param chunksize:
    :return:
    """
    if wiki_path is not None:
        logging.info('Generating wiki corpus...')
        wikis = unpickle(wiki_path)
        wiki_corpus = [dictionary.doc2bow(wiki) for wiki in wikis]

        logging.info('Combining original corpus and wiki corpus...')
        corpus = corpus + wiki_corpus  # wiki_corpus is merged after the original corpus

    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=passes,
                         iterations=iterations, alpha='auto', chunksize=chunksize)
    corpus_ids = get_corpus_ids(dictionary.corpus_id2orig_id)
    # doc_vector_ids = dictionary.corpus_id2orig_id[corpus_ids]
    doc_vector_ids = [dictionary.corpus_id2orig_id[corpus_id] for corpus_id in corpus_ids]
    doc_vectors = lda_model.inference(corpus)[0]
    doc_vectors = doc_vectors[corpus_ids, :]
    doc_vectors = doc_vectors / doc_vectors.sum(axis=1).reshape(doc_vectors.shape[0], 1)

    return lda_model, doc_vectors, doc_vector_ids
Example #5
def lda(docs, k):
    """Latent Dirichlet allocation topic model.

    Uses Gensim's LdaModel after tokenizing using scikit-learn's
    TfidfVectorizer.

    Parameters
    ----------
    k : integer
        Number of topics.
    """
    from gensim.matutils import Sparse2Corpus
    from gensim.models import LdaModel

    # Use a scikit-learn vectorizer rather than Gensim's equivalent
    # for speed and consistency with LSA and k-means.
    vect = _vectorizer()
    corpus = vect.fit_transform(fetch(d) for d in docs)
    corpus = Sparse2Corpus(corpus)

    model = LdaModel(corpus=corpus, num_topics=k)

    topics = model.show_topics(formatted=False)
    vocab = vect.get_feature_names()
    #return [(vocab[int(idx)], w) for topic in topics for w, idx in topic]
    return [[(vocab[int(idx)], w) for w, idx in topic] for topic in topics]
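# A hedged alternative sketch (toy texts are made up for illustration): the
# LdaModel above is trained without an id2word mapping, which is why topic
# terms come back as stringified ids that must be looked up in the vectorizer
# vocabulary. Passing the mapping in lets print_topics report words directly.
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel

texts = ["human interface computer", "survey user computer system", "graph minors trees"]

vect = TfidfVectorizer()
dtm = vect.fit_transform(texts)

# scikit-learn puts documents in rows, hence documents_columns=False.
corpus = Sparse2Corpus(dtm, documents_columns=False)

# vocabulary_ maps word -> column index; invert it for gensim's id2word.
id2word = {idx: word for word, idx in vect.vocabulary_.items()}

model = LdaModel(corpus=corpus, id2word=id2word, num_topics=2)
print(model.print_topics(num_topics=2, num_words=5))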
Example #6
class TestLdaDiff(unittest.TestCase):
    def setUp(self):
        self.dictionary = common_dictionary
        self.corpus = common_corpus
        self.num_topics = 5
        self.n_ann_terms = 10
        self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=self.num_topics, passes=10)

    def testBasic(self):
        # test for matrix case
        mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms)

        self.assertEqual(mdiff.shape, (self.num_topics, self.num_topics))
        self.assertEqual(len(annotation), self.num_topics)
        self.assertEqual(len(annotation[0]), self.num_topics)

        # test for diagonal case
        mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, diagonal=True)

        self.assertEqual(mdiff.shape, (self.num_topics,))
        self.assertEqual(len(annotation), self.num_topics)

    def testIdentity(self):
        for dist_name in ["hellinger", "kullback_leibler", "jaccard"]:
            # test for matrix case
            mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name)

            for row in annotation:
                for (int_tokens, diff_tokens) in row:
                    self.assertEqual(diff_tokens, [])
                    self.assertEqual(len(int_tokens), self.n_ann_terms)

            self.assertTrue(np.allclose(np.diag(mdiff), np.zeros(mdiff.shape[0], dtype=mdiff.dtype)))

            if dist_name == "jaccard":
                self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

            # test for diagonal case
            mdiff, annotation = \
                self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name, diagonal=True)

            for (int_tokens, diff_tokens) in annotation:
                self.assertEqual(diff_tokens, [])
                self.assertEqual(len(int_tokens), self.n_ann_terms)

            self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

            if dist_name == "jaccard":
                self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

    def testInput(self):
        self.assertRaises(ValueError, self.model.diff, self.model, n_ann_terms=self.n_ann_terms, distance='something')
        self.assertRaises(ValueError, self.model.diff, [], n_ann_terms=self.n_ann_terms, distance='something')
Example #7
def extract_topics(words):
    word_id_map=Dictionary([words])
    word_id_map.filter_tokens([id for id, occurance in word_id_map.dfs.iteritems() if occurance == 2])
    word_id_map.compactify()
    deals_corpus=[word_id_map.doc2bow(words)]
    lda=LdaModel(corpus=deals_corpus, id2word=word_id_map, num_topics=15, update_every=1, chunksize=1000,passes=1)
    topics=[]
    for i in range(15):
        tokens=lda.print_topic(i).split('+')
        topic_scores=[]
        for token in tokens:
            score,token_val=token.split('*')
            topic_scores.append((token_val,score))
        topics.append(topic_scores)
    return topics
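# A hedged aside: print_topic returns a formatted string, so the
# split('+')/split('*') parsing above is brittle (newer gensim also quotes the
# terms). show_topic yields (word, probability) pairs directly in recent
# gensim releases. Tiny self-contained sketch of the same extraction:
from gensim.corpora import Dictionary
from gensim.models import LdaModel

docs = [['deal', 'coupon', 'online'], ['deal', 'travel', 'hotel'], ['coupon', 'hotel']]
dct = Dictionary(docs)
lda = LdaModel([dct.doc2bow(d) for d in docs], id2word=dct, num_topics=2, passes=5)
topics = [lda.show_topic(t, topn=5) for t in range(lda.num_topics)]  # [(word, prob), ...] per topic
print(topics)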
Example #8
    def __init__(self, destination, fileName, modelName='', ldaPasses='', topicNum=''):
        '''
        Constructor
        '''
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        
        self.__destination = destination
        self.__fileName = fileName
        self.__modelName = modelName
        self.__ldaPasses = ldaPasses
        self.__topicNum = topicNum
                
        #=======================================================================
        # STOP WORDS AND CHARACTERS
        #=======================================================================
        self.__stopwords = stopwords.words('english')# + string.punctuation
        self.__chars_to_remove = [u'[', u']', u'(', u')', u'*', u'%', u'{', u'}', u'\n', u'\n\n', u'\t', u';',u'/',u'^',u'--',u'\\',u'+',u'-',u'.',u'?',u'&',u'#',u'',u'']
        self.__stopwords.extend(self.__chars_to_remove)
        self.__stopwords.extend([item for item in string.punctuation])

        #=======================================================================
        # DATABASE
        #=======================================================================
        self.__db = connectMySQL(db='xpath', port=3366)
        self.__queryResults = None
        self.__cleanedCorpus = []
        

        if modelName != '' and os.path.exists(self.__destination+modelName+'.lda'):
            self.__ldaModel = LdaModel.load(self.__destination+modelName+'.lda', mmap='r') 
            
        if fileName != '' and os.path.exists(self.__destination+fileName+'.dict'):
            self.__modelDict = corpora.Dictionary.load(self.__destination+fileName+'.dict')
Example #9
def write_topics(model_path, csv_name, k):
    model = LdaModel.load(model_path)
    topics = []
    for topic_id in range(model.num_topics):
        topics.append(model.return_topic(topicid=topic_id))

    dictionary = Dictionary.load('data/dictionary/tweets.dict')
    word_indices = dictionary.id2token
    writer = csv.writer(file(csv_name, 'w'))

    output = [[0 for i in range(model.num_topics)] for j in range(k)]
    for topic_id, topic in enumerate(topics):
        for rank, index in enumerate(topic.argsort()[::-1]):
            output[rank][topic_id] = {}
            output[rank][topic_id]['word'] = word_indices[index]
            output[rank][topic_id]['p'] = topic[index]
            rank += 1
            if rank >= k:
                break

    for topic_id in range(model.num_topics):
        row = ['z = ' + str(topic_id)]

        for rank in range(k):
            row.append(output[rank][topic_id]['word'] + ':' + str(output[rank][topic_id]['p']))

        writer.writerow(row)
Example #10
File: dmp.py Project: npiaq/dmp
 def load(self):
     '''Load the lda model and the dic dictionary.
     '''
     lda_file = config.get('dmp', 'lda_file')
     dic_file = config.get('dmp', 'dic_file')
     self.lda = LdaModel.load(lda_file)
     self.dic = Dictionary.load(dic_file)
def create_evaluation_distinctiveness(config, Kind):
    model_fname = config.model_fname % Kind.__name__

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except:
        error('Cannot evaluate LDA models not built yet!')

    scores = utils.score(model, utils.kullback_leibler_divergence)
    total = sum([x[1] for x in scores])

    logger.info("%s model KL: %f" % (model_fname, total))
    with open(config.path + 'evaluate-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, total])

    etas = list()
    for topic in model.state.get_lambda():
        topic_eta = list()
        for p_w in topic:
            topic_eta.append(p_w * numpy.log2(p_w))
        etas.append(-sum(topic_eta))  # one entropy value per topic

    entropy = sum(etas) / len(etas)

    logger.info("%s model entropy mean: %f" % (model_fname, entropy))
    with open(config.path + 'evaluate-entropy-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, entropy])
Example #12
class CorpusLdaModelWrapper:
    def __init__(self, corpus, dictionary, doc_labels, preprocessing_pipeline, numtopics):
        self.corpus = corpus
        self.dictionary = dictionary
        self.doc_labels = doc_labels
        self.pipeline = preprocessing_pipeline
        self.numtopics = numtopics
        self.trained = False

    def train(self):
        # training
        self.model = LdaModel(self.corpus, id2word=self.dictionary, num_topics=self.numtopics)
        self.index = MatrixSimilarity(self.model[self.corpus])

        # flag
        self.trained = True

    def convertTextToReducedVector(self, text):
        if not self.trained:
            raise exceptions.ModelNotTrainedException()
        tokens = word_tokenize(prep.preprocess_text(text, self.pipeline))
        tokens = filter(lambda token: self.dictionary.token2id.has_key(token), tokens)
        bow = self.dictionary.doc2bow(tokens)
        return self.model[bow]

    def queryDoc(self, text):
        reducedVec = self.convertTextToReducedVector(text)
        sims = self.index[reducedVec]
        simtuples = zip(range(len(sims)), sims) if self.doc_labels==None else zip(self.doc_labels, sims)
        simtuples = sorted(simtuples, key=lambda item: item[1], reverse=True)
        return simtuples

    def show_topic(self, id):
        return self.model.show_topic(id)
Example #13
    def train(self):
        # training
        self.model = LdaModel(self.corpus, id2word=self.dictionary, num_topics=self.numtopics)
        self.index = MatrixSimilarity(self.model[self.corpus])

        # flag
        self.trained = True
Example #14
    def calculateLDADistance(self, modelName='', topNSimilar='', topicList=''):
        
        if modelName=='':
            modelName=self.__fileName
    
        if topNSimilar=='':
            topNSimilar=5       
            
        write2file = self.__destination+modelName+"_results_LDA_similarTopics.csv"
        resultsCSV = open(write2file, "wb")
        
        print 'Reading model data'
        gensimDict = corpora.Dictionary.load(self.__destination+self.__fileName+'.dict')
        ldaModel = LdaModel.load(self.__destination+modelName+'.lda',  mmap=None)
        topics = ldaModel.show_topics(num_topics=ldaModel.num_topics, num_words=len(gensimDict),formatted=False)
        #=======================================================================
        # num_topics=ldaModel.num_topics                             
        # num_words=len(gensimDict)
        #=======================================================================
        
        #=======================================================================
        # GET SIMILARITY VECTORS
        #=======================================================================
        print 'Extracting vectors'
        topicsSorted = [sorted(x,  key=lambda x: x[1]) for x in topics]
        vectors = []
            
        for topic in topicsSorted:
            vector = [item[0] for item in topic]
            vectors.append(vector)

        #=======================================================================    
        # CALCULATE SIMILARITIES BETWEEN TOPICS
        #=======================================================================
        print 'Calculating distances between LDA topics\n'
        results = []
        for topicListItem in topicList:
            distances = []
            for j in range (0, len(vectors)):
                dist = euclidean(vectors[topicListItem], vectors[j])
                #===============================================================
                # print topicListItem, j, dist
                #===============================================================
                distances.append(dist)
            results.append(distances)

        #=======================================================================
        # EXPORT TOP N SIMILAR TOPICS AND PRINT OUT QUERY TERMS
        #=======================================================================
        print 'Writing found similar topics to file\n'
        for resultItem in range(0,len(results)):
            similarLDATopics = np.argsort(results[resultItem])[::-1]
              
            for similarItem in similarLDATopics[:topNSimilar]:
                #===============================================================
                # print topicList[resultItem],similarItem
                #===============================================================
                resultsCSV.write(str(topicList[resultItem])+'; '+str(similarItem)+'; '+', '.join(x[1].lstrip().rstrip() for x in topics[similarItem][:100])+'\n\n')
            resultsCSV.write('*******************************************\n\n')
def evaluate_log(context, config):
    logger.info('Evaluating models for: %s' % config.project.name)

    model_fname = config.model_fname % ChangesetCorpus.__name__
    changeset_fname = config.corpus_fname % ChangesetCorpus.__name__
    commit_fname = config.corpus_fname % CommitLogCorpus.__name__

    try:
        commit_id2word = Dictionary.load(commit_fname + '.dict')
        commit_corpus = MalletCorpus(commit_fname,
                                     id2word=commit_id2word)
        changeset_id2word = Dictionary.load(changeset_fname + '.dict')
        changeset_corpus = MalletCorpus(changeset_fname,
                                        id2word=changeset_id2word)
    except:
        error('Corpora not built yet -- cannot evaluate')

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except:
        error('Cannot evaluate LDA models not built yet!')

    changeset_doc_topic = get_doc_topic(changeset_corpus, model)
    commit_doc_topic = get_doc_topic(commit_corpus, model)

    first_shared = dict()
    for id_ in commit_doc_topic:
        i = 0
        commit_topics = [topic[0] for topic in commit_doc_topic[id_]]
        try:
            changeset_topics = [topic[0] for topic in changeset_doc_topic[id_]]
        except:
            continue

        maximum = 101
        minimum = maximum

        for i, topic in enumerate(commit_topics):
            if topic in changeset_topics:
                j = changeset_topics.index(topic)
                minimum = min(minimum, max(i, j))

        for i, topic in enumerate(changeset_topics):
            if topic in commit_topics:
                j = commit_topics.index(topic)
                minimum = min(minimum, max(i, j))

        first_shared[id_] = minimum

        if minimum == maximum:
            logger.info('No common topics found for %s' % str(id_))
            del first_shared[id_]

    mean = sum(first_shared.values()) / len(first_shared)

    with open('data/evaluate-log-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, mean] + list(first_shared.values()))
Example #16
  def __init__(self):

    cwd = os.path.dirname(__file__)
    dictionary_path = os.path.abspath(os.path.join(cwd, 'models/dictionary.dict'))
    lda_model_path = os.path.abspath(os.path.join(cwd, 'models/lda_model_10_topics.lda'))

    self.dictionary = corpora.Dictionary.load(dictionary_path)
    self.lda = LdaModel.load(lda_model_path)
Example #17
def load_lda_model(lda_model_name=None, mallet=False):
    if os.path.isfile(lda_model_name):
        if mallet:
            lda_model = LdaMallet.load(lda_model_name)
        else:
            lda_model = LdaModel.load(lda_model_name)
        return lda_model
    return None
 def topicsLDA(self, num_topics=10, num_iterations=10000, num_words=10):
     # LdaModel(corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001)
     try:
         lda = LdaModel(corpus=self.corpus, num_topics=num_topics, id2word=self.id2word, iterations=num_iterations)
         result = {}
         tpd = lda[self.corpus] # topic probability distribution
         for topics in tpd:
             for elem in topics:
                 if result.get(elem[0], -1) == -1:
                     words = lda.show_topic(elem[0], topn=num_words)
                     result[elem[0]] = {'weight': elem[1], 'words': words}
                 else:
                     result[elem[0]]['weight'] += elem[1]
         return result
     except Exception as e:
         print e
         return None
    def setUp(self):
        self.corpus = MmCorpus(datapath('testcorpus.mm'))
        self.ch_umass = CoherenceMetric(corpus=self.corpus, coherence="u_mass", logger="visdom", title="Coherence")
        self.callback = [self.ch_umass]
        self.model = LdaModel(id2word=common_dictionary, num_topics=2, passes=10, callbacks=self.callback)

        self.host = "http://localhost"
        self.port = 8097
Example #20
    def analyzeLDA(self, modelName='', numberOfTerms=''):
        '''
        modelName -> name of model to read in to memory without the extension
        '''
        
        if modelName=='':
            modelName=self.__fileName
            
        if numberOfTerms == '':
            numberOfTerms=100
            
        write2file = self.__destination+modelName+"_results_%s_SW.csv"%(numberOfTerms)
        #=======================================================================
        # allTopicsFile = self.__destination+modelName+"_results_AllTopics.csv"
        #=======================================================================
        
        resultsCSV = open(write2file, "wb")
        model = LdaModel.load(self.__destination+modelName+'.lda',  mmap=None)
         
        #and another way, only prints top words 
        for t in range(0, model.num_topics):  # iterate over all topics
            #===================================================================
            # print 'topic {}: '.format(t) + ', '.join([v[1] for v in model.show_topic(t, numberOfTerms)])
            #===================================================================

            topicSet = [v[1].lstrip().rstrip() for v in model.show_topic(t, numberOfTerms) if v[1] not in self.__stopwords]
            listSet = set(topicSet)

            for key in self.__queryWords:  
                difference = set(topicSet).intersection(self.__queryWords[key])
                 
                if len(difference) > 0:
                    self.__overlapingTopics[key][t]=topicSet
        
        try:
            for key in self.__overlapingTopics:
                if self.__overlapingTopics[key]:
                    for topicKey in self.__overlapingTopics[key]:
                        topicTerms = [w.lstrip().rstrip() for w in self.__overlapingTopics[key][topicKey] if w not in self.__stopwords][:100]
                        #=======================================================
                        # topicTerms = [w.translate(None, ''.join(self.__chars_to_remove)) for w in topicTerms if w !='']
                        #=======================================================
                        resultsCSV.write(key+';'+str(topicKey)+';'+', '.join(topicTerms)+'\n\n')
                        print key,'\t',topicKey,'\t', topicTerms
                    resultsCSV.write('***************************************\n')
                print '*************************\n'
                
            write2fileJSON = self.__destination+modelName+"_results_%s_SW.json"%(numberOfTerms)
            with open(write2fileJSON, 'w') as fp:
                json.dump(self.__overlapingTopics, fp)
     
        except KeyError as e: 
            print e
            pass 
        
        resultsCSV.close()
Example #21
    def analyzeUniqueLDA(self, modelName='', numberOfTerms=''):
        '''
        modelName -> name of model to read in to memory without the extension
        '''
        
        if modelName=='':
            modelName=self.__fileName
            
        if numberOfTerms=='':
            numberOfTerms=100
            
        write2File = self.__destination+modelName+"_results_unique_%sTerms.csv"%(numberOfTerms)
        resultsCSV = open(write2File, "wb")
        
        model = LdaModel.load(self.__destination+modelName+'.lda',  mmap=None)

         
        #and another way, only prints top words 
        for t in range(0, model.num_topics):  # iterate over all topics
            #===================================================================
            # print 'topic {}: '.format(t) + ', '.join([v[1] for v in model.show_topic(t, 500)])
            #===================================================================
            # raw_input('prompt')
            topicSet = [v[1].lstrip().rstrip() for v in model.show_topic(t, numberOfTerms) if v[1] not in self.__stopwords]
            #===================================================================
            # print type(topicSet), topicSet
            #===================================================================
            listSet = set(topicSet)
            #print listSet
            #print type(topicSet), topicSet
            for key in self.__queryWords:  
                #print self.__queryWords[key]
                difference = set(topicSet).intersection(self.__queryWords[key])
                 
                if len(difference) > 0:
                    self.__overlapingTopics[key][t]=topicSet
        
        try:
            for key in self.__overlapingTopics:
                uniqueQueryTerms = []
                if self.__overlapingTopics[key]:
                    for topicKey in self.__overlapingTopics[key]:
                        topicTerms = [w for w in self.__overlapingTopics[key][topicKey] if w not in self.__stopwords]
                        uniqueQueryTerms.extend(topicTerms)
                        
                uniqueQueryTerms = [x for x in set(uniqueQueryTerms)]
                resultsCSV.write(key+';'+str(topicKey)+';'+', '.join(uniqueQueryTerms)+'\n\n')
                resultsCSV.write('***************************************\n')
                print key, uniqueQueryTerms
                print '*************************\n'

        except KeyError as e: 
            print e
            pass 
        
        resultsCSV.close()
Example #22
class TestLdaDiff(unittest.TestCase):
    def setUp(self):
        texts = [
            ['human', 'interface', 'computer'],
            ['survey', 'user', 'computer', 'system', 'response', 'time'],
            ['eps', 'user', 'interface', 'system'],
            ['system', 'human', 'system', 'eps'],
            ['user', 'response', 'time'],
            ['trees'],
            ['graph', 'trees'],
            ['graph', 'minors', 'trees'],
            ['graph', 'minors', 'survey'],
        ]
        self.dictionary = Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.num_topics = 5
        self.n_ann_terms = 10
        self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=self.num_topics, passes=10)

    def testBasic(self):
        mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms)

        self.assertEqual(mdiff.shape, (self.num_topics, self.num_topics))
        self.assertEqual(len(annotation), self.num_topics)
        self.assertEqual(len(annotation[0]), self.num_topics)

    def testIdentity(self):
        for dist_name in ["hellinger", "kullback_leibler", "jaccard"]:
            mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name)

            for row in annotation:
                for (int_tokens, diff_tokens) in row:
                    self.assertEqual(diff_tokens, [])
                    self.assertEqual(len(int_tokens), self.n_ann_terms)

            self.assertTrue(np.allclose(np.diag(mdiff), np.zeros(mdiff.shape[0], dtype=mdiff.dtype)))

            if dist_name == "jaccard":
                self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

    def testInput(self):
        self.assertRaises(ValueError, self.model.diff, self.model, n_ann_terms=self.n_ann_terms, distance='something')
        self.assertRaises(ValueError, self.model.diff, [], n_ann_terms=self.n_ann_terms, distance='something')
Example #23
def get_topics_lda(tokens, n_topics=10):
    """
    Using the `gensim` package for LDA. 
    LDA is a little better than LSA as it provides a reasonable mixture of topics (Wikipedia).
    `gensim` is a package for topic modeling only, so for a dedicated topic modeling task
    it is a lighter option to install and run. It can also be run distributed and can update an existing model.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print "Dictionary file does not exist. Creating one"
        dictionary = Dictionary(tokens)
        freq1 = [id for id, freq in dictionary.dfs.iteritems() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)
    # print dictionary

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print "Corpus file does not exist. Creating one"
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)
    # print mm
    # tfidf = TfidfModel(mm)
    # corpus_tfidf = tfidf[mm]

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics, update_every=1, chunksize=1000,
                   passes=1)
    topics = []
    for i in range(0, n_topics):
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w, score))
        topics.append(topic)
    return topics
def create_model(config, Kind):
    model_fname = config.model_fname % Kind.__name__
    corpus_fname = config.corpus_fname % Kind.__name__

    if not os.path.exists(model_fname):
        try:
            id2word = Dictionary.load(corpus_fname + '.dict')
            corpus = MalletCorpus(corpus_fname, id2word=id2word)
            logger.info('Opened previously created corpus: %s' % corpus_fname)
        except:
            error('Corpora for building file models not found!')

        file_model = LdaModel(corpus,
                              id2word=corpus.id2word,
                              alpha=config.alpha,
                              passes=config.passes,
                              num_topics=config.num_topics)

        file_model.save(model_fname)
def get_keywords(threshold=0.01, model_path='result/model.lda'):
    lda_model = LdaModel.load(model_path)
    topic_num = lda_model.num_topics
    keywords = set()
    for topic_id in range(topic_num):
        topic = lda_model.state.get_lambda()[topic_id]
        topic = topic / topic.sum()  # normalize to probability dist
        signif_word_ids = np.where(topic > threshold)[0]
        keywords = keywords.union([lda_model.id2word[word_id] for word_id in signif_word_ids])

    return keywords
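# A hedged equivalent sketch: reaching into model.state.get_lambda() and
# normalizing by hand works, but recent gensim releases expose the already
# normalized topic-word matrix via get_topics(), so the same keywords can be
# extracted like this:
import numpy as np
from gensim.models import LdaModel

def get_keywords_from_topics(threshold=0.01, model_path='result/model.lda'):
    # get_topics() returns a (num_topics x vocab_size) array whose rows sum to 1
    lda_model = LdaModel.load(model_path)
    keywords = set()
    for topic in lda_model.get_topics():
        for word_id in np.where(topic > threshold)[0]:
            keywords.add(lda_model.id2word[word_id])
    return keywords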
Example #26
File: dmp.py Project: npiaq/dmp
    def train(self):
        '''Train the model; this produces two objects: the dictionary (dic) and the model (lda).

        dic: stores the words; each word has an id and can be looked up via dic[id]
        lda: the model, holding the list of topics; each topic has an id, and its
             word list can be obtained via lda.print_topic(id)
        '''
        docs = self.__load_corpus()
        self.dic = Dictionary(docs)
        bow = [self.dic.doc2bow(doc) for doc in docs]
        self.lda = LdaModel(bow, id2word=self.dic,
                            num_topics=self.topic_num)
Example #27
    def __init__(self):
        # current_working_dir = '/home/etu/eason/nodejs/Semantic_Aware_RecSys'
        current_working_dir = '.'
        os.chdir(current_working_dir)
        lda_model_path = "./LDAmodel/final_ldamodel"

        self.lda = LdaModel.load(lda_model_path)
        self.no_of_recommendation = 10
        self.omit_topic_below_this_fraction = 0.1
        self.mapping = self.__init_mapping()
        self.linkMapping = self.__init_Link_mapping()
        self.doc_topic_matrix = loadPickleFile('doc_topic_matrix')
Example #28
 def getAllTopics(self, modelName='', numberOfTerms=100):
     '''
     modelName -> name of model to read in to memory without the extension
     '''
     
     returningData = {}
     
     if modelName=='':
         modelName=self.__fileName
         
     model = LdaModel.load(self.__destination+modelName+'.lda',  mmap=None)
     
     return model.show_topics(num_topics=model.num_topics,num_words=numberOfTerms, formatted=False)
def get_topics(cv, train_data):
    """
    Uses gensim to perform topic modeling.

    Parameters
    ---------
    cv: A TfidfVectorizer instance.
    train_data: A scipy csr_matrix.

    Returns
    -------
    A list of (topic, coherence score) pairs from top_topics, where each topic is a list of (probability, term) tuples.
    """

    td_gensim = Sparse2Corpus(train_data, documents_columns=False)
    tmp_dct = dict((idv, word) for word, idv in cv.vocabulary_.items())
    dct = Dictionary.from_corpus(td_gensim, id2word=tmp_dct)

    lda = LdaModel(corpus=td_gensim, id2word=dct, num_topics=20)
    topics = lda.top_topics(corpus=td_gensim, num_words=5)

    return topics
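# A hedged usage note on the value returned above: each entry of `topics` is a
# ([(probability, word), ...], coherence_score) pair, so the term strings have
# to be unpacked from that structure, e.g.:
for topic_terms, coherence in topics:
    words = [word for probability, word in topic_terms]
    print('coherence %.3f: %s' % (coherence, ', '.join(words)))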
Example #30
    def train(self):
        data = []
        entity2id = {}
        id2entity = []

        for obj in self.data:
            doc = []
            obj_sents = obj["text_data"]
            entity = obj["prod"]
            if entity not in entity2id:
                entity2id[entity] = len(entity2id)
                id2entity.append(entity)
            doc_id = entity2id[entity]

            for obj_sent in obj_sents:
                for pair in obj_sent:
                    if pair[0] >= 0:
                        doc.append((pair[0], doc_id))
            data.append(doc)



        self.ldamodel = LdaModel(corpus=data, id2word=self.idx2word, num_topics=self.n_topic)

        f_entity = open("lda/prod.txt", "w")
        f_model = open("lda/model.txt", "w")
        f_model.write(str(len(entity2id)))
        f_model.write(" ")
        f_model.write(str(self.n_topic))
        f_model.write("\n")

        for entity in id2entity:
            f_entity.write(entity)
            f_entity.write("\n")

            f_model.write(entity)
            f_model.write(" ")

            distr = self.ldamodel.get_document_topics(data[entity2id[entity]], minimum_phi_value=0, minimum_probability=0)
            distr = [pair[1] for pair in distr]

            for prod in distr:
                f_model.write(str(prod))
                f_model.write(" ")

            f_model.write("\n")

        self.ldamodel.save("lda/model_200")
Example #31
 def LDALoad(self):
     self.ldamodel = LdaModel.load("fixed_time_window_lda.model")
     self.dictionary = Dictionary.load("lda_dictionary.model")
     print(self.dictionary)
Example #32
from gensim import models

train = []
stopwords = codecs.open('../../corpus/English_StopWords.txt',
                        'r',
                        encoding='utf8').readlines()
stopwords = [w.strip() for w in stopwords]
fp = codecs.open('../../corpus/test.lsnp', 'r', encoding='utf8')
for line in fp:
    line = line.split()
    train.append([w for w in line if w not in stopwords])
print(train)
dictionary = Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]
print(corpus[0])
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
# print the word distribution of the first 20 topics
print(lda.print_topics(20))
# print the word distribution of the topic with id 20
print(lda.print_topic(20))
# save / load the model
lda.save('zhwiki_lda.model')
lda = models.ldamodel.LdaModel.load('zhwiki_lda.model')

# tt = 'loss of energy , motivation and no interest in work anymore - be it time to through it all in'
#
# test_doc = list(i for i in tt.split())
#
# doc_bow = id2word.doc2bow(test_doc)      # convert the document to bag-of-words
# doc_lda = lda[doc_bow]                   # topic distribution of the new document
# # print the topic distribution of the new document
Example #33
def gensim_lda_topic_modelling(path,
                               documents,
                               num_of_topics=6,
                               passes=50,
                               verbose=True,
                               plotTopicsResults=True):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    if verbose:
        print("Cleaned documents:\n", documents)
        print("\nDictionary:\n", dictionary)
        print("\nCorpus in BoW form: \n", corpus)
    start = time.time()
    ldamodel = LdaModel(corpus=corpus,
                        num_topics=num_of_topics,
                        passes=passes,
                        id2word=dictionary)
    end = time.time()
    print("Completion time for building LDA model: %.3f s = %.3f min" %
          ((end - start), (end - start) / 60.0))

    ldatopics = ldamodel.show_topics(formatted=False)
    ldatopics_words = [[[word, prob] for word, prob in topic]
                       for topicid, topic in ldatopics]

    if verbose:
        print("\nList of words associated with each topic:\n")
        for i in range(len(ldatopics_words)):
            print("\nTopic %d:\n" % i)
            for w, p in ldatopics_words[i]:
                print(p, " - ", w)

    if plotTopicsResults:
        plot_top_10_words_per_topic(path,
                                    ldatopics_words,
                                    num_topics=6,
                                    num_top_words=10)

    all_documents_topics = [
        (doc_topics, word_topics, word_phis)
        for doc_topics, word_topics, word_phis in ldamodel.get_document_topics(
            corpus, per_word_topics=True)
    ]
    all_doc_topics = []
    for i in range(len(all_documents_topics)):
        doc_topics, word_topics, phi_values = all_documents_topics[i]
        all_doc_topics.append(
            [doc_topics[i][1] for i in range(len(doc_topics))])
        if verbose:
            print('Document topics:', doc_topics)
            print('Word topics:', word_topics)
            print('Phi values:', phi_values)
            print('-------------- \n')

    if plotTopicsResults:
        plot_share_of_topics(path, all_doc_topics, no_random_tweets=10)

    # Plot words coloured differently depending on the topic
    for doc in documents[0:100]:
        if len(doc) > 4:
            color_words(ldamodel, doc)
Example #34
    # we add some words to the stop word list
    texts, article = [], []
    for w in doc:
        # if it's not a stop word or punctuation mark, add it to our article!
        if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num and w.text != 'I':
            # we add the lematized version of the word
            article.append(w.lemma_)
        # if it's a new line, it means we're onto our next document
        if w.text == '\n':
            texts.append(article)
            article = []
    # for i in texts:
    #     print(i)

    bigram = gensim.models.Phrases(texts)
    texts = [bigram[line] for line in texts]
    # print(texts)
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    if len(corpus) == 0:
        print("fadssf")
    else:
        ldamodel = LdaModel(corpus=corpus, num_topics=3, id2word=dictionary)

        for i in ldamodel.show_topics():
            ans = ' '.join(i[1].split(" + "))
            # ans=' '.join(ans.split('*'))
            ans = ''.join(ans.split('"'))

            print(a + "~" + ans)
Example #35
class LDATagger:
    _lda_model = None
    _dictionary = None
    _lda_model_path = None
    _dictionary_path = None
    DEFAULT_MODEL_PATH = os.path.join(os.path.dirname(__file__), "model")
    DEFAULT_NUM_TOPICS = 1000

    def __init__(self,
                 model_path=DEFAULT_MODEL_PATH,
                 num_topics=DEFAULT_NUM_TOPICS,
                 lock=threading.Lock()):
        self.save_model_lock = lock

        if os.path.isfile(model_path):
            raise Exception("Invalid Model Path; Should Be a Directory")
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        self._lda_model_path = os.path.join(model_path, "lda.model")
        self._dictionary_path = os.path.join(model_path, "tokens.dict")
        self.num_topics = num_topics
        self.model_folder_lock = FileLock(model_path)

    def topics_for_documents(self, doc_tokens_map):
        self.check_and_load_model()
        doc_topics_map = defaultdict(list)
        for document_id, document_tokens in doc_tokens_map.iteritems():
            doc_topics_map[document_id] = self.topics_for_document(
                document_tokens)
        return doc_topics_map

    def topics_for_document(self, tokens):
        self.check_and_load_model()
        bow_tokens = self._dictionary.doc2bow(tokens)
        topics = self._lda_model[bow_tokens]
        return topics

    def build_topics(self, tokens_list):
        self._dictionary = Dictionary(tokens_list)
        corpus = [
            self._dictionary.doc2bow(document_tokens)
            for document_tokens in tokens_list
        ]
        self._lda_model = LdaModel(corpus=corpus,
                                   id2word=self._dictionary,
                                   num_topics=self.num_topics,
                                   passes=100)
        self.save_model()

    def save_model(self, sleep_for_test=False, mock_datastruct=None):
        self.save_model_lock.acquire()
        self.model_folder_lock.acquire()
        if mock_datastruct: mock_datastruct.acquire()
        if sleep_for_test:
            import time
            time.sleep(1)
        print("Acquired Lock")
        try:
            self._lda_model.save(self._lda_model_path)
            self._dictionary.save(self._dictionary_path)
        finally:
            print("Released Lock")
            if mock_datastruct: mock_datastruct.release()
            self.model_folder_lock.release()
            self.save_model_lock.release()

    def check_and_load_model(self):
        if self._lda_model and self._dictionary:
            return
        if os.path.exists(self._lda_model_path):
            self._lda_model = LdaModel.load(self._lda_model_path)
        else:
            raise Exception("LDA Model Not found in the path")
        if os.path.exists(self._dictionary_path):
            self._dictionary = Dictionary.load(self._dictionary_path)
        else:
            raise Exception("Tokens Dictionary Not found in the path")

    def update_model(self, tokens_list):
        self.check_and_load_model()
        corpus = [
            self._dictionary.doc2bow(document_tokens)
            for document_tokens in tokens_list
        ]
        self._lda_model.update(corpus=corpus)
        self.save_model()

    def build_or_update_model(self, tokens_list):
        if not self.does_model_exist():
            self.build_topics(tokens_list)
        else:
            self.update_model(tokens_list)

    def does_model_exist(self):
        if os.path.exists(self._lda_model_path) and os.path.exists(
                self._dictionary_path):
            return True
        return False

    def get_model(self):
        self.check_and_load_model()
        model_hash = {
            "lda_model": cPickle.dumps(self._lda_model),
            "dictionary": cPickle.dumps(self._dictionary)
        }
        return model_hash

    def restore_model(self, model_hash):
        self._lda_model = cPickle.loads(
            model_hash["lda_model"].encode('utf-8'))
        self._dictionary = cPickle.loads(
            model_hash["dictionary"].encode('utf-8'))
        self.save_model()

    def topics_to_tokens(self):
        topics_tokens_map = defaultdict(list)
        if not self.does_model_exist():
            return []
        else:
            model = self._lda_model
            topics_to_tokens = model.show_topics(
                topics=self.DEFAULT_NUM_TOPICS,
                topn=25,
                log=False,
                formatted=False)

            for topic_id, tokens in enumerate(topics_to_tokens):
                topics_tokens_map[topic_id] = self.list_of_tuples_to_hash(
                    tokens)

            return topics_tokens_map

    def list_of_tuples_to_hash(self, tokens):
        tokens_hash = defaultdict(float)
        for token_probability, token in tokens:
            tokens_hash[token] = token_probability
        return tokens_hash
    dtm = vectorizer.fit_transform(docs)
    sparse.save_npz(dtm_path, dtm)
    tokens = vectorizer.get_feature_names()
    vocab_size = len(tokens)
    pd.Series(tokens).to_csv(token_path, index=False)

    id2word = pd.Series(tokens).to_dict()
    corpus = Sparse2Corpus(dtm, documents_columns=False)

    # dictionary = Dictionary.from_corpus(corpus=train_corpus, id2word=id2word)

    # for n_topics in [3, 5, 7, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 75, 100]:
    for n_topics in [5, 10, 15, 20, 30]:
        print(n_topics, end=' ', flush=True)
        lda = LdaModel(corpus=corpus, num_topics=n_topics, id2word=id2word)

        doc_topics = pd.DataFrame()
        for i, topics in enumerate(lda.get_document_topics(corpus)):
            doc_topics = pd.concat([
                doc_topics,
                pd.DataFrame(topics, columns=['topic', 'value']).assign(doc=i)
            ])
        doc_topics.to_csv(model_path / f'doc_topics_{key}_{n_topics}.csv',
                          index=False)

        model_file = datapath((model_path / f'{key}_{n_topics}').resolve())
        lda.save(model_file)
        train_lda = LdaModel(corpus=train_corpus,
                             num_topics=n_topics,
                             id2word=pd.Series(train_tokens).to_dict())
id2word_nouns = dict_nouns.id2token

# Display
#pp.pprint(id2word_nouns)

# Display results of Corpus
# print(corpus_nouns)
# print('Number of unique tokens: {}'.format(len(dict_nouns)))
# print('Number of documents: {}'.format(len(corpus_nouns)))

# TODO: save corpus and dictionary to disk and load them back
# save to path_lda_data

lda_nouns = LdaModel(corpus=corpus_nouns,
                     id2word=id2word_nouns,
                     num_topics=10,
                     iterations=300,
                     eval_every=1)

lda_nouns.print_topics(-1)

# Print the Keyword in the 10 topics
pp.pprint(lda_nouns.print_topics())

########################
########################

#u_mass coherence measure
from gensim.models.coherencemodel import CoherenceModel
lda_nouns_cm = CoherenceModel(model=lda_nouns,
                              corpus=corpus_nouns,
def run_lda_with_entropy(industry_lda, token_dict, max_k=5):
    common_dictionary = corpora.Dictionary(industry_lda)
    common_corpus = [common_dictionary.doc2bow(text) for text in industry_lda]
    ldamodel = LdaModel(corpus=common_corpus, num_topics=max_k + 1, id2word=common_dictionary)
    result = ldamodel.print_topics(num_topics=max_k + 1, num_words=10)
    center_lst = []
    for i in range(max_k + 1):
        result2 = ldamodel.get_topic_terms(topicid=i)
        sum_word = 0
        center = 0
        length = len(result2)
        for v in result2:
            if common_dictionary[v[0]] in token_dict.keys():
                center += token_dict[common_dictionary[v[0]]]
        center_lst.append(center / length)

    industry_with_center_distance = []
    sum_temp5_lst = []
    for i in industry_lda:
        temp2 = []
        for k in i:
            temp = []
            if k in token_dict.keys():
                for j in center_lst:
                    temp.append((cal_sim(np.array(token_dict[k]), j)))
            if len(temp) > 0:
                temp2.append(temp)
        if len(temp2) > 0:
            temp3 = np.array(temp2)
            temp4 = np.mean(temp3, axis=0)
            temp5 = np.sum(temp3)
        else:
            temp4 = []
            for i in range(0, max_k + 1):
                temp4.append(0.0)
            temp5 = temp4

        industry_with_center_distance.append(temp4)
        sum_temp5_lst.append(temp5)

    entro_result_final = {}

    for number, i in enumerate(industry_lda):
        entro_result_2 = []
        for k in i:
            entro_result = []
            if k in token_dict.keys():
                for j in center_lst:
                    temp = cal_sim(np.array(token_dict[k]), j)
                    temp_value = temp / sum_temp5_lst[number]
                    entro_result.append(temp_value * math.log(temp_value))
            entro_result_2.append(entro_result)
        if len(entro_result_2) > 0:
            temp5 = np.zeros(shape=(1, max_k + 1), dtype=float)
            for w in entro_result_2:
                if len(w) > 0:
                    temp4 = np.array(w)
                    temp5 += temp4

            list_temp5 = list(temp5[0])
            entro_result_final[number] = list_temp5.index(max(list_temp5))

    final_result = defaultdict(list)
    for i in range(0, max_k + 1):
        for key, value in entro_result_final.items():
            if value == i:
                final_result[i].append(industry_lda[key])
Example #39
import numpy as np
from gensim.models import LdaMulticore as LdaModel
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train LDA model')
    parser.add_argument('tweet_file', help=('path to twitter downloader dump where each line is a cleaned tweet'))
    parser.add_argument('out_dir', help=('path output file to save the model'))
    parser.add_argument('--include_list', nargs='?', default=None)  # see preprocess for default list
    parser.add_argument('--exclude_list', nargs='?', default=None)  # see preprocess for default list

    parser.add_argument('num_topics', type=int)
    parser.add_argument('--npasses', type=int, default=50)
    parser.add_argument('--decay', type=float, default=.5)
    parser.add_argument('--chunksize', type=int, default=2000)

    lda_filename    = 'lda/middle_east_100.lda'
    args = parser.parse_args()
    corpus = corpora.MmCorpus('lda/lda_middle_east.mm')
    dictionary = corpora.Dictionary.load('lda/lda_middle_east.dict')

    lda = LdaModel(corpus, num_topics=100,
                   alpha=1./100, eta=.2, chunksize=10000,
                   workers=5, passes=100, decay=0.75,
                   id2word=dictionary)

    print('Saving model')
    lda.print_topics()
    lda.save(lda_filename)
    print("lda saved in %s " % lda_filename)
Example #40
def main():
    global args

    taskname = args.taskname
    no_below = args.no_below
    no_above = args.no_above
    num_iters = args.num_iters
    n_topic = args.n_topic
    n_cpu = cpu_count() - 2 if cpu_count() > 2 else 2
    bkpt_continue = args.bkpt_continue
    use_tfidf = args.use_tfidf
    rebuild = args.rebuild
    auto_adj = args.auto_adj

    docSet = DocDataset(taskname,
                        no_below=no_below,
                        no_above=no_above,
                        rebuild=rebuild)
    if auto_adj:
        no_above = docSet.topk_dfs(topk=20)
        docSet = DocDataset(taskname,
                            no_below=no_below,
                            no_above=no_above,
                            rebuild=rebuild,
                            use_tfidf=False)

    model_name = 'LDA'
    msg = 'bow' if not use_tfidf else 'tfidf'
    run_name = '{}_K{}_{}_{}'.format(model_name, n_topic, taskname, msg)
    if not os.path.exists('logs'):
        os.mkdir('logs')
    if not os.path.exists('ckpt'):
        os.mkdir('ckpt')
    loghandler = [
        logging.FileHandler(filename=f'logs/{run_name}.log', encoding="utf-8")
    ]
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(message)s',
                        handlers=loghandler)
    logger = logging.getLogger(__name__)

    if bkpt_continue:
        print('loading model ckpt ...')
        lda_model = gensim.models.ldamodel.LdaModel.load(
            'ckpt/{}.model'.format(run_name))

    # Training
    print('Start Training ...')

    if use_tfidf:
        tfidf = TfidfModel(docSet.bows)
        corpus_tfidf = tfidf[docSet.bows]
        #lda_model = LdaMulticore(list(corpus_tfidf),num_topics=n_topic,id2word=docSet.dictionary,alpha='asymmetric',passes=num_iters,workers=n_cpu,minimum_probability=0.0)
        lda_model = LdaModel(list(corpus_tfidf),
                             num_topics=n_topic,
                             id2word=docSet.dictionary,
                             alpha='asymmetric',
                             passes=num_iters)
    else:
        #lda_model = LdaMulticore(list(docSet.bows),num_topics=n_topic,id2word=docSet.dictionary,alpha='asymmetric',passes=num_iters,workers=n_cpu)
        lda_model = LdaModel(list(docSet.bows),
                             num_topics=n_topic,
                             id2word=docSet.dictionary,
                             alpha='asymmetric',
                             passes=num_iters)

    save_name = f'./ckpt/LDA_{taskname}_tp{n_topic}_{time.strftime("%Y-%m-%d-%H-%M", time.localtime())}.ckpt'
    lda_model.save(save_name)

    # Evaluation
    print('Evaluation ...')
    topic_words = get_topic_words(model=lda_model,
                                  n_topic=n_topic,
                                  topn=15,
                                  vocab=docSet.dictionary)

    (cv_score, w2v_score, c_uci_score,
     c_npmi_score), _ = calc_topic_coherence(topic_words,
                                             docs=docSet.docs,
                                             dictionary=docSet.dictionary)

    topic_diversity = calc_topic_diversity(topic_words)

    result_dict = {
        'cv': cv_score,
        'w2v': w2v_score,
        'c_uci': c_uci_score,
        'c_npmi': c_npmi_score
    }
    logger.info('Topics:')

    for idx, words in enumerate(topic_words):
        logger.info(f'##{idx:>3d}:{words}')
        print(f'##{idx:>3d}:{words}')

    for measure, score in result_dict.items():
        logger.info(f'{measure} score: {score}')
        print(f'{measure} score: {score}')

    logger.info(f'topic diversity: {topic_diversity}')
    print(f'topic diversity: {topic_diversity}')
Example #41
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and training the LDA model on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=10, id2word=dictionary, passes=50)
print('used: {:.2f}s'.format(time() - start))
print(ldamodel.print_topics(num_topics=2, num_words=4))

for i in ldamodel.print_topics():
    for j in i:
        print(j)

ldamodel.save(MODEL_FILE)

from gensim.models import LdaModel
loading = LdaModel.load(MODEL_FILE)

print(loading.print_topics(num_topics=2, num_words=4))


def pre_new(doc):
    one = cleaning(doc).split()
    two = dictionary.doc2bow(one)
    return two


pre_new('new article that to be classified by trained model!')

belong = loading[(
    pre_new('new article that to be classified by trained model!'))]
print(belong)
Example #42
import logging

from gensim.models import LdaModel
from gensim import corpora


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

dictionary_path = "topics_labels/models/dictionary.dict"
corpus_path = "topics_labels/models/corpus.lda-c"
lda_num_topics = 25
lda_model_path = "topics_labels/models/lda_model_50_topics.lda"

dictionary = corpora.Dictionary.load(dictionary_path)
corpus = corpora.BleiCorpus(corpus_path)
lda = LdaModel.load(lda_model_path)

topics = [5,10,15,20,25]
for num_topics in topics:
	print "number of topics:", num_topics
	i = 0
	for topic in lda.show_topics(num_topics):
	    print '#' + str(i) + ': ' + topic
	    i += 1

Example #43
"""
Classifies all articles according to the LDA model
"""
import os
from multiprocessing.pool import Pool

import psycopg2
from gensim import corpora
from gensim.models import LdaModel

from src.features.text import article_tokenizer
from src.visualization.console import StatusVisualization

dictionary = corpora.Dictionary.load(os.environ['MODEL_PATH'] +
                                     'articles.dict')
model = LdaModel.load(os.environ['MODEL_PATH'] + 'articles.lda')


def classify(article):
    source_url, text = article
    tokens = article_tokenizer.tokenize(text)
    """
    doc_bow = [dictionary.doc2bow(token) for token in [tokens]]
    doc_lda = model.get_document_topics(doc_bow,
                                        minimum_probability=None,
                                        minimum_phi_value=None,
                                        per_word_topics=False)
    topics = doc_lda[0]
    topics_ret = dict()
    for topic in topics:
        print(topics)
for i, (min_df, max_df, binary) in enumerate(dtm_params, 1):

    print(min_df, max_df, binary)

    vocab_path = experiment_path / str(min_df) / str(max_df) / str(int(binary))
    try:
        dtm = sparse.load_npz(vocab_path / 'dtm.npz')
        tokens = pd.read_csv(vocab_path / 'tokens.csv', header=None, squeeze=True)
    except FileNotFoundError:
        print('missing')
        continue
    corpus = Sparse2Corpus(dtm, documents_columns=False)
    id2word = tokens.to_dict()
    dictionary = Dictionary.from_corpus(corpus, id2word)

    for num_topics in topics:
        print(num_topics, end=' ')
        model_path = vocab_path / str(num_topics) / str(passes) / 'lda'
        if model_path.exists():
            lda = LdaModel.load(model_path.as_posix())
        else:
            continue
        start = time()
        vis = prepare(lda, corpus, dictionary, mds='tsne')
        terms = vis.topic_info
        terms = terms[terms.Category != 'Default']
        pyLDAvis.save_html(vis, (model_path / 'ldavis.html').as_posix())
        terms.to_csv(model_path / 'relevant_terms.csv', index=False)
        duration = time() - start
        print(format_time(duration))
Example #45
    type=int,
)
args = parser.parse_args()

print('Reading dataset')
data = pd.read_parquet(args.input_filepath)

print('Normalizing text')
data.text = data.text.map(nlp.normalize_text)

print('Building docterm matrix')
docterm, dictionary = nlp.get_docterm_matrix(data.text)
doclength = np.array([sum(x[1] for x in doc) for doc in docterm])

print('Training LDA model')
lda = LdaModel(docterm, num_topics=args.n_topics)

print('Getting document topics')
doctopics = corpus2csc([lda.get_document_topics(doc) for doc in docterm])
termtopics = lda.get_topics()

print('Computing topic volume time series')
topic_volume_over_time = nlp.get_topic_volume_over_time(data, doctopics, 20)

print('Computing topic coordinates')
topic_coordinates = nlp.get_topic_coordinates(termtopics, method='mds')
topic_proportions = nlp.get_topic_proportions(doctopics, doclength)

print('Computing term frequencies')
term_frequencies = nlp.get_term_frequencies(docterm, termtopics,
                                            topic_proportions, doclength)
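
# The nlp.* helpers used above are project-specific and not shown. As an assumed sketch,
# get_topic_proportions presumably averages each document's topic weights, weighted by
# document length (definition and name are assumptions):
import numpy as np

def get_topic_proportions_sketch(doctopics, doclength):
    # doctopics: sparse (num_topics x num_docs) matrix, as produced by corpus2csc above
    # weight each document's topic distribution by its length, then normalise
    weighted = doctopics.multiply(doclength.reshape(1, -1)).sum(axis=1)
    proportions = np.asarray(weighted).ravel()
    return proportions / proportions.sum()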
class TestLdaDiff(unittest.TestCase):
    def setUp(self):
        self.dictionary = common_dictionary
        self.corpus = common_corpus
        self.num_topics = 5
        self.n_ann_terms = 10
        self.model = LdaModel(corpus=self.corpus,
                              id2word=self.dictionary,
                              num_topics=self.num_topics,
                              passes=10)

    def testBasic(self):
        # test for matrix case
        mdiff, annotation = self.model.diff(self.model,
                                            n_ann_terms=self.n_ann_terms)

        self.assertEqual(mdiff.shape, (self.num_topics, self.num_topics))
        self.assertEqual(len(annotation), self.num_topics)
        self.assertEqual(len(annotation[0]), self.num_topics)

        # test for diagonal case
        mdiff, annotation = self.model.diff(self.model,
                                            n_ann_terms=self.n_ann_terms,
                                            diagonal=True)

        self.assertEqual(mdiff.shape, (self.num_topics, ))
        self.assertEqual(len(annotation), self.num_topics)

    def testIdentity(self):
        for dist_name in ["hellinger", "kullback_leibler", "jaccard"]:
            # test for matrix case
            mdiff, annotation = self.model.diff(self.model,
                                                n_ann_terms=self.n_ann_terms,
                                                distance=dist_name)

            for row in annotation:
                for (int_tokens, diff_tokens) in row:
                    self.assertEqual(diff_tokens, [])
                    self.assertEqual(len(int_tokens), self.n_ann_terms)

            self.assertTrue(
                np.allclose(np.diag(mdiff),
                            np.zeros(mdiff.shape[0], dtype=mdiff.dtype)))

            if dist_name == "jaccard":
                self.assertTrue(
                    np.allclose(mdiff, np.zeros(mdiff.shape,
                                                dtype=mdiff.dtype)))

            # test for diagonal case
            mdiff, annotation = \
                self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name, diagonal=True)

            for (int_tokens, diff_tokens) in annotation:
                self.assertEqual(diff_tokens, [])
                self.assertEqual(len(int_tokens), self.n_ann_terms)

            self.assertTrue(
                np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

            if dist_name == "jaccard":
                self.assertTrue(
                    np.allclose(mdiff, np.zeros(mdiff.shape,
                                                dtype=mdiff.dtype)))

    def testInput(self):
        self.assertRaises(ValueError,
                          self.model.diff,
                          self.model,
                          n_ann_terms=self.n_ann_terms,
                          distance='something')
        self.assertRaises(ValueError,
                          self.model.diff, [],
                          n_ann_terms=self.n_ann_terms,
                          distance='something')
Example #47
class TopicModel:
    def __init__(self, topicCollection, string):
        if string.lower() == "nmf":
            self.model = "NMF"
            print("Topic Extraction Model: sklearn.NMF")
        else:
            self.model = "LDA"
            print("Topic Extraction Model: gensim.LDAModel")
        self.stemmer = PorterStemmer()

    #Train the topic model (NMF or LDA) on the current discussion
    def train(self, sentences):
        if self.model == "NMF":
            self.sentenceData = []
            for sentence in sentences:
                self.sentenceData.append(preprocess(sentence, self.stemmer))
            self.tfidf_vectorizer = TfidfVectorizer(
                max_features=1500,
                ngram_range=(1, 2),
                preprocessor=' '.join,
                stop_words='english'
            )
            tfidf = self.tfidf_vectorizer.fit_transform(self.sentenceData)
            self.nmf = NMF(n_components=2, solver="mu")
            self.W = self.nmf.fit_transform(tfidf)
            self.H = self.nmf.components_
        else:
            sentenceData = []
            for sentence in sentences:
                sentenceData.append(preprocess(sentence, self.stemmer))
            self.dictionary = Dictionary(sentenceData)
            bow_corpus = [self.dictionary.doc2bow(doc) for doc in sentenceData]
            self.lda_model = LdaModel(bow_corpus, num_topics=2, id2word=self.dictionary, passes=10)

    #Classify a given sentence into one of the topics found during training
    def classify(self, sentence):
        if self.model == "NMF":
            index = self.sentenceData.index(preprocess(sentence, self.stemmer))
            topic = self.W.argmax(axis=1)[index]
            return "Topic " + str(topic)
        else:
            bow_vector = self.dictionary.doc2bow(preprocess(sentence, self.stemmer))
            return "Topic " + str(sorted(self.lda_model[bow_vector], key=lambda tup: -1*tup[1])[0][0])

    #Shows the terms of a given topic
    def showTerms(self, topic):
        if self.model == "NMF":
            terms = ""
            top_features = []
            tfidf_feature_names = self.tfidf_vectorizer.get_feature_names()
            for topic_idx, topicID in enumerate(self.H):
                if topic_idx == int(topic.split(' ')[-1]):
                    top_features_ind = topicID.argsort()[:-20 - 1:-1]
                    top_features = [tfidf_feature_names[i] for i in top_features_ind]
                    weights = topicID[top_features_ind]
            for term in top_features:
                terms += term + ", "
            print(topic.split(' ')[-1] + " " + terms)
            return terms
        else:
            terms = ""
            topic = int(topic.split(" ")[-1])
            for term in self.lda_model.show_topic(topic):
                terms += term[0] + ", "
            print(str(topic) + " " + terms)
            return terms

    #Gets the probability or the coefficient of the given term in the topic
    def getCoeff(self, topic, term):
        if self.model == "NMF":
            weights = []
            top_features = []
            tfidf_feature_names = self.tfidf_vectorizer.get_feature_names()
            for topic_idx, topicID in enumerate(self.H):
                if topic_idx == topic:
                    top_features_ind = topicID.argsort()[:-20 - 1:-1]
                    top_features = [tfidf_feature_names[i] for i in top_features_ind]
                    weights = topicID[top_features_ind]
            for coeff, terms in zip(weights, top_features):
                if terms == term:
                    return coeff
        else:
            topic = int(topic.split(" ")[-1])
            for terms in self.lda_model.show_topic(topic):
                if terms[0] == term:
                    return terms[1]

    #Shows all the topics found in training
    def showTopics(self):
        if self.model == "NMF":
            ret = []
            for topic_idx, topicID in enumerate(self.H):
                ret.append("Topic " + str(topic_idx))
            return ret
        else:
            topics = self.lda_model.print_topics()
            ret = []
            for topic in topics:
                ret.append("Topic " + str(topic[0]))
            return ret

    #Returns a flag to check what model is deployed at the moment
    def getModel(self):
        return self.model
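
# Hypothetical usage of the TopicModel class above (assumes the same preprocess helper
# and imports the class itself relies on are available in scope):
tm = TopicModel(topicCollection=None, string="lda")
tm.train(["the first discussion sentence about testing",
          "a second sentence on a rather different subject"])
print(tm.showTopics())
print(tm.classify("a second sentence on a rather different subject"))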
    def __init__(self):
        self.dictionary = corpora.Dictionary.load("models/dictionary.dict")
        self.lda = LdaModel.load("models/lda_model.lda")
Example #49
# sanity check: look up a few word ids in the dictionary
print("Wordlist from the dictionary lookup:", dictionary[21], dictionary[22],
      dictionary[23], dictionary[24], dictionary[25], dictionary[26],
      dictionary[27])

# In[ ]:

#scale it to all text
corpus = [dictionary.doc2bow(text) for text in all_text]
end_corpus = time.time()
print("Time till corpus creation:", end_corpus - start_time, "s")

# In[ ]:

#create the LDA model
ldamodel = LdaModel(corpus=corpus, num_topics=15, id2word=dictionary)
end_lda = time.time()
print("Time till LDA model creation:", end_lda - start_time, "s")

# In[ ]:

pyLDAvis.enable_notebook()

# In[ ]:

pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

# In[ ]:

end_viz = time.time()
print("Time till viz:", end_viz - start_time, "s")
Example #50
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return topic_list, coherence_values


# Code starts here
topic_list, coherence_value_list = compute_coherence_values(
    dictionary=dictionary,
    corpus=doc_term_matrix,
    texts=doc_clean,
    start=1,
    limit=41,
    step=5)

# Finding the index associated with maximum coherence value
max_index = coherence_value_list.index(max(coherence_value_list))
opt_topic = topic_list[max_index]

print("optimum no of topics:", opt_topic)

# Fit LDA with the optimal number of topics
lda_model = LdaModel(corpus=doc_term_matrix,
                     num_topics=opt_topic,
                     id2word=dictionary,
                     iterations=10,
                     passes=30,
                     random_state=0)

lda_model.print_topics(5)
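
# Only the tail of compute_coherence_values is included above; a minimal sketch of the
# usual pattern it follows (one LdaModel per candidate topic count, scored with gensim's
# CoherenceModel -- names, arguments, and defaults here are assumptions):
from gensim.models import CoherenceModel, LdaModel

def compute_coherence_values_sketch(dictionary, corpus, texts, start, limit, step):
    topic_list, coherence_values = [], []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, id2word=dictionary,
                         num_topics=num_topics, random_state=0)
        topic_list.append(num_topics)
        # c_v coherence needs the tokenised texts, not just the bag-of-words corpus
        coherencemodel = CoherenceModel(model=model, texts=texts,
                                        dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return topic_list, coherence_values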
Example #51
from TechDashAPI.mysqlUtilities import connectMySQL
from TechDashAPI.ContentExtractor import ContentExtractor
from TechDashAPI.ContentExtractorTrainer import ContentExtractorTrainer
from TechDashAPI.createDOM import createDom
from TechDashAPI.util import utilities
from TechDashAPI.topicModeling import techDashTopicModel

from gensim.models import LdaModel

db = connectMySQL(db='xpath', port=3366)
filesFolder = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/xpathModels/'
utilitiesFunctions = utilities()

modelDestination = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/modelsLDA/'
modelName = 'fullModel_100P_20T'
model = LdaModel.load(modelDestination+modelName+'.lda',  mmap=None)
topicModel = techDashTopicModel(destination='/Users/jurica/Documents/workspace/eclipse/TechDashboard/modelsLDA/', fileName='fullModel', modelName='fullModel_100P_20T')

#===============================================================================
# UPDATE ALL ARTICLES TO NEW TOPICS
#===============================================================================

sqlQuery = """SELECT `xpathValuesXPath`.`xpathValuesID`, `xpathValuesXPath`.`xpathValuesContent` FROM `xpath`.`xpathValuesXPath`; """

db.executeQuery(sqlQuery)

for item in db._connectMySQL__results:
    #===========================================================================
    # print item
    #===========================================================================
    topicModelCat = topicModel.getDocumentTopics(item[1])
Example #52
corpus = load_obj('LDABOWcorpus-application')
#%%
# lda model training  (ETA 10 mins)

num_topics = 6  # topics declared (based on MATLAB tut)
chunk_size = 300
t1 = time.time()
# low alpha => each document is represented by only a small number of topics, and vice versa
# low eta => each topic is represented by only a small number of words, and vice versa
lda = LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dct,
    alpha='auto',
    random_state=100,
    # eta=None,
    update_every=1,
    chunksize=chunk_size,
    minimum_probability=0.0,
    # iterations=100,
    # gamma_threshold=0.001,
    passes=10,
    per_word_topics=True)

lda.get_document_topics(bow=corpus, per_word_topics=True)
tpl = lda.print_topics(num_topics=6, num_words=5)
topic, contrib = zip(*tpl)

t2 = time.time()
print("Time to train LDA model on", len(df), "articles:", (t2 - t1) / 60,
      "min")
Example #53
from gensim import corpora, similarities
from gensim.models import LdaModel, LsiModel

lda_model = LdaModel.load('./data/lda_model')
lsi_model = LsiModel.load('./data/lsi_model')
id2word = corpora.Dictionary.load('./data/id2word')
index = similarities.MatrixSimilarity.load('./data/index')