Example #1
def train_lda(corpus, token_dict, num_topics, update, passes, csize):
    return ldamodel.LdaModel(corpus=corpus,
                             id2word=token_dict,
                             num_topics=num_topics,
                             update_every=update,
                             passes=passes,
                             chunksize=csize)
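A minimal usage sketch for the wrapper above (the toy documents and variable names below are hypothetical):

from gensim import corpora
from gensim.models import ldamodel

docs = [["human", "machine", "interface"], ["graph", "trees", "minors"]]
token_dict = corpora.Dictionary(docs)
corpus = [token_dict.doc2bow(doc) for doc in docs]

# update=1, passes=1 and csize=2000 mirror LdaModel's defaults
model = train_lda(corpus, token_dict, num_topics=2, update=1, passes=1, csize=2000)
print(model.show_topics())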
Example #2
    def testTopicSeeding(self):
        passed = False
        for topic in range(2):
            # try seeding each of the two topics in turn, and check that the
            # same topics come out, with their order determined by which
            # topic was seeded
            for i in range(5): # restart at most 5 times

                eta = numpy.ones((2, len(dictionary))) * 0.5
                system = dictionary.token2id[u'system']
                trees = dictionary.token2id[u'trees']

                # aggressively seed the word 'system', in one of the
                # two topics, 10 times higher than the other words
                eta[topic, system] *= 10

                model = ldamodel.LdaModel(id2word=dictionary, num_topics=2, passes=200, eta=eta)
                model.update(corpus)

                topics = [dict((word, p) for p, word in model.show_topic(j)) for j in range(2)]

                # check that the word 'system' got a high weight in the topic
                # we seeded, and the word 'trees' (the main word in the other
                # topic) a low weight -- and vice versa for the other topic
                # (which we didn't seed with 'system')
                result = [[topics[topic].get(u'system',0), topics[topic].get(u'trees',0)],
                          [topics[1-topic].get(u'system',0), topics[1-topic].get(u'trees',0)]]
                expected = [[0.385, 0.022],
                            [0.025, 0.157]]
                passed = numpy.allclose(result, expected, atol=1e-2)
                if passed:
                    break
                logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
                                (i, result, expected))
            self.assertTrue(passed)
Example #3
    def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods, iter_,
                      lda_inference_max_iter, chunksize):
        """
        Inference or E- Step.
        This is used to set up the gensim LdaModel to be used for each time-slice.
        It also allows for Document Influence Model code to be written in.
        """
        num_topics = self.num_topics
        vocab_len = self.vocab_len
        bound = 0.0

        lda = ldamodel.LdaModel(num_topics=num_topics,
                                alpha=self.alphas,
                                id2word=self.id2word)
        lda.topics = np.array(
            np.split(np.zeros(vocab_len * num_topics), vocab_len))
        ldapost = LdaPost(max_doc_len=self.max_doc_len,
                          num_topics=num_topics,
                          lda=lda)

        model = "DTM"
        if model == "DTM":
            bound, gammas = self.inferDTMseq(corpus, topic_suffstats, gammas,
                                             lhoods, lda, ldapost, iter_,
                                             bound, lda_inference_max_iter,
                                             chunksize)
        elif model == "DIM":
            self.InfluenceTotalFixed(corpus)
            bound, gammas = self.inferDIMseq(corpus, topic_suffstats, gammas,
                                             lhoods, lda, ldapost, iter_,
                                             bound, lda_inference_max_iter,
                                             chunksize)

        return bound, gammas
Example #4
    def testTransform(self):
        passed = False
        # sometimes, LDA training gets stuck at a local minimum
        # in that case try re-training the model from scratch, hoping for a
        # better random initialization
        for i in range(5):  # restart at most 5 times
            # create the transformation model
            model = ldamodel.LdaModel(id2word=dictionary,
                                      num_topics=2,
                                      passes=100)
            model.update(corpus)

            # transform one document
            doc = list(corpus)[0]
            transformed = model[doc]

            vec = matutils.sparse2full(
                transformed,
                2)  # convert to dense vector, for easier equality tests
            expected = [0.13, 0.87]
            passed = numpy.allclose(
                sorted(vec), sorted(expected),
                atol=1e-2)  # must contain the same values, up to re-ordering
            if passed:
                break
            logging.warning(
                "LDA failed to converge on attempt %i (got %s, expected %s)" %
                (i, sorted(vec), sorted(expected)))
        self.assertTrue(passed)
Example #5
 def initialize(self, myid, dispatcher, **model_params):
     self.lock_update = threading.Lock()
     self.jobsdone = 0  # how many jobs has this worker completed?
     self.myid = myid  # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove?
     self.dispatcher = dispatcher
     logger.info("initializing worker #%s" % myid)
     self.model = ldamodel.LdaModel(**model_params)
Example #6
    def topic_modeling(self, data_table, cols, n_topics):

        # Identify the words and clean the table.
        data_table, words = self.identify_words(data_table, cols)

        # Create a dictionary based on the words we have identified per row.
        dict_topics = gensim.corpora.Dictionary(data_table[self.col_name])
        # Create a corpus in which each identified word forms its own one-word document.
        corpus = [dict_topics.doc2bow([word]) for word in words]

        # Apply LDA.
        model = lda.LdaModel(corpus, id2word=dict_topics, num_topics=n_topics)

        # Get the topics we found.
        topics = model.show_topics(num_topics=n_topics,
                                   num_words=10,
                                   log=False,
                                   formatted=False)

        # Create columns for the topics.
        for topic in range(0, n_topics):
            data_table[f'{cols[0]}_topic_{topic}'] = 0.0

        # Score the topics per row and set the values accordingly.
        for i in range(0, len(data_table.index)):
            topic_scores = model[dict_topics.doc2bow(
                data_table[self.col_name][i])]
            for score in topic_scores:
                data_table.iloc[i,
                                data_table.columns.get_loc(
                                    f'{cols[0]}_topic_{score[0]}')] = score[1]
        # Remove the temporary column we had created for the cleaned lists of words.
        del data_table[self.col_name]
        return data_table
Example #7
def lda_mod_get(x, passes=10):
    newmod = ldamodel.LdaModel(corpus_train,
                               id2word=dictionary_train,
                               num_topics=x,
                               passes=passes,
                               per_word_topics=True)
    return newmod
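A typical use of this helper is sweeping candidate topic counts; a sketch assuming corpus_train and dictionary_train are in scope as above:

models = {k: lda_mod_get(k) for k in (5, 10, 20)}
for k, m in models.items():
    print(k, m.show_topics(num_topics=3, num_words=5))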
Example #8
    def fit_LDA(self):
        """ 
        Fit data in LDA. currently assuming that number of cores remains constant at 1. 
        
        :param {str} lda_filepath: 
            Where to save lda model to. 
            
        :param {str} pyldavis_filepath: 
            Where to save pyldavis model to. 
        
        :param {int} num_topics: 
            Number of topics the LDA model should look for. 
            
        """

        self.lda = ldamodel.LdaModel(corpus=self.corpus,
                                     alpha='auto',
                                     id2word=self.dictionary,
                                     **self.run_parameters)
        lda_vis_serialized = pyLDAvis.gensim.prepare(self.lda,
                                                     self.corpus,
                                                     self.dictionary,
                                                     sort_topics=False)
        pyLDAvis.save_html(lda_vis_serialized, self.pyldavis_filepath)
        self.lda.save(self.lda_filepath)
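A sketch of how such a trainer instance might be configured before calling fit_LDA (the trainer object and attribute values here are hypothetical; run_parameters is forwarded to LdaModel as keyword arguments):

trainer.run_parameters = {'num_topics': 10, 'passes': 20}
trainer.lda_filepath = 'model.lda'
trainer.pyldavis_filepath = 'lda_vis.html'
trainer.fit_LDA()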
Example #9
def do(documents):
    #remove common words and tokenize
    stoplist = set('for a of the and to in'.split())
    texts = [[
        word for word in document.lower().split() if word not in stoplist
    ] for document in documents]

    #remove words that appear only once
    # all_tokens = sum(texts, [])
    # tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    # texts = [[word for word in text if word not in tokens_once] for text in texts]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # I can print out the topics for LSA
    # lsi = models.LdaModel(corpus, id2word=dictionary, num_topics=5)
    # corpus_lsi = lsi[corpus]

    # for l,t in izip(corpus_lsi,corpus):
    # 	print l,"#",t
    # 	print
    # for top in lsi.print_topics(2):
    # 	print top

    # I can print out the documents and which is the most probable topics for each doc.
    lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=1)
    print(lda.show_topics())

    topics_matrix = lda.show_topics(formatted=False, num_words=2)
    print(topics_matrix)
    print(np.array(topics_matrix))
Example #10
def find_lda_context(train_records, test_records):
    """
    Uses the training records to create a topic model and then updates both
    the training and testing records with a vector of probabilities for each
    topic from the recently created topic model
    """

    dictionary = preprocess_records(train_records, test_records)
    corpus = [record[Constants.CORPUS_FIELD] for record in train_records]
    print(corpus)
    topic_model = ldamodel.LdaModel(
        corpus, id2word=dictionary,
        num_topics=num_topics,
        passes=Constants.LDA_MODEL_PASSES,
        iterations=Constants.LDA_MODEL_ITERATIONS)

    print(corpus)
    for i in range(num_topics):
        print(topic_model.show_topic(i, topn=2))

    records = train_records + test_records

    for record in records:
        document_topics =\
            topic_model.get_document_topics(record[Constants.CORPUS_FIELD])
        lda_context = [document_topic[1] for document_topic in document_topics]
        record['lda_context'] = lda_context

        context_topics = {}
        for i in range(num_topics):
            topic_id = 'topic' + str(i)
            context_topics[topic_id] = document_topics[i][1]

        record[Constants.CONTEXT_TOPICS_FIELD] = context_topics
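Note that get_document_topics omits topics whose probability falls below its minimum_probability threshold, so indexing document_topics[i] by position can raise an IndexError for sparse results. A safer sketch using that keyword:

document_topics = topic_model.get_document_topics(
    record[Constants.CORPUS_FIELD], minimum_probability=0)
context_topics = {'topic' + str(i): prob for i, prob in document_topics}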
Example #11
 def generateTopic(self,wordsLists, method=TopicMethod.LSI, numTopics=25):
     """step4: 主题向量转换"""
     """Note:
            采用LDA转换后,经文本相似度比较后发现效果一点都不好,
            故而采用LSI转换,效果不错.
                             Created by flx on 2018-4-7
     """
     bowCorpus = self.generateBow(wordsLists)
     tfidfCorpus = self.generateTfidf(bowCorpus)
     if method == TopicMethod.LDA:
         instance = ldamodel.LdaModel(tfidfCorpus, id2word=self.dictionary, num_topics=numTopics)
         CacheUtil.dumpTopicModel(instance)
     elif method == TopicMethod.LSI:
         instance = lsimodel.LsiModel(tfidfCorpus, id2word=self.dictionary, num_topics=numTopics)
         CacheUtil.dumpTopicModel(instance)
     dstCorpus = instance[tfidfCorpus]
     features=[]
     # After a gensim transformation, each document is a list of tuples, e.g.:
     #   vec = [(0, 0.12345), (2,0.458124),(4,0.485263),(7,0.589542)...]
     # Only the non-zero entries of the vector are stored,
     # so we convert to an ordinary dense vector here.
     for doc in dstCorpus:
         vector=[0]*numTopics
         for pair in doc:
             vector[pair[0]] = pair[1]
         features.append(vector)
     return features
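The dense-conversion loop above can also be expressed with gensim's own helper; a minimal sketch, assuming dstCorpus and numTopics as defined above:

from gensim import matutils

# corpus2dense returns a (num_terms, num_docs) matrix; transpose it for one row per document
features = matutils.corpus2dense(dstCorpus, num_terms=numTopics).T.tolist()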
Example #12
 def train(self,
           filepath,
           dict_path,
           docs,
           num_topics=5,
           passes=100,
           chunksize=2000,
           alpha=0.5,
           eta=0.5):
     if path.exists(filepath):
         LOGGER.info('Model already exists...load model')
         self._inner_model = ldamodel.LdaModel.load(filepath)
     else:
         start = time.time()
         clean_docs = [d for d in docs]
         if path.exists(dict_path):
             LOGGER.info('Dictionary already exists...loading dictionary')
             self._dict = corpora.Dictionary.load(dict_path)
         else:
             self._dict = corpora.Dictionary(clean_docs)
             self._dict.save(dict_path)
             self.dict_time = (time.time() - start)
         corpus_dict = self._dict
         corpus = [self._dict.doc2bow(x) for x in clean_docs]
         self._inner_model = ldamodel.LdaModel(corpus,
                                               num_topics=num_topics,
                                               id2word=corpus_dict,
                                               passes=passes,
                                               chunksize=chunksize,
                                               alpha=alpha,
                                               eta=eta)
         self._inner_model.save(filepath)
         self.model_time = (time.time() - start)
     return self
Example #13
	def compute(self):
		vec_texts = [text.split() for text in self.texts]
		write("\n    "+"-> Computing the dictionary".ljust(50,'.')) if self.debug else ''
		dictionary = Dictionary(vec_texts)
		write("[OK]") if self.debug else ''
		write("\n    "+"-> Creating the bag-of-words space".ljust(50,'.')) if self.debug else '' 
		corpus = [dictionary.doc2bow(vec) for vec in vec_texts]
		write("[OK]") if self.debug else ''
		write("\n    "+("-> Creating the %s space" % self.method).ljust(50,'.') ) if self.debug else '' 
		tfidf_space = TfidfModel(corpus)
		tfidf_corpus = tfidf_space[corpus]
		if self.method == 'TFIDF':
			self.space = tfidf_space
			self.index = MatrixSimilarity(tfidf_corpus)
		elif self.method == 'LSI': 
			self.space = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t) 
			self.index = MatrixSimilarity(self.space[tfidf_corpus])
		elif self.method == 'RP': 
			self.space = RpModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t) 
			self.index = MatrixSimilarity(self.space[tfidf_corpus])
		elif self.method == 'LDA':
			self.space = ldamodel.LdaModel(tfidf_corpus, id2word=dictionary, 
														 num_topics=self.num_t)
			self.index = MatrixSimilarity(self.space[tfidf_corpus])
		self.dictionary = dictionary
		write("[OK]\n") if self.debug else ''
Example #14
def gen_V_lda(R_prime,num_topics=10,trained_lda=None,dictionary=None):
	# delimiters=R_prime['delimiters']
	# bag_o_words=R_prime['bag_o_words']
	#source=R_prime['source']
	topics=[]
	topic_probability=[]
	bag_o_words=OrderedDict()
	for key in R_prime.keys():
		bag_o_words[key]=sum(R_prime[key])
	words=[[key]*bag_o_words[key] for key in bag_o_words.keys()]
	# dictionary=corpora.Dictionary(words)
	# corpus=[dictionary.doc2bow(text) for text in words]

	if trained_lda is not None:
		assert isinstance(trained_lda,ldamodel.LdaModel)
		assert isinstance(dictionary,corpora.Dictionary)
		corpus=[dictionary.doc2bow(text) for text in words]
		trained_lda.update(corpus)
		topics=trained_lda.show_topics(num_topics=num_topics,num_words=len(dictionary),formatted=False)
	else:
		dictionary=corpora.Dictionary(words)
		corpus=[dictionary.doc2bow(text) for text in words]
		lda=ldamodel.LdaModel(corpus,id2word=dictionary,num_topics=100)
		topics=lda.show_topics(num_topics=num_topics,num_words=len(dictionary),formatted=False)
	V_prime=OrderedDict()
	for word in R_prime.keys():
		V_prime[word]=[0]*num_topics

	for i,topic in enumerate(topics):
		for entry in topic:
			V_prime[entry[1]][i]=entry[0]

	return V_prime
Example #15
def gentler_lda_entire_corpus(in_folder, ofile, wordlist, num_topics=30):
    files = sorted([
        i for i in os.listdir(in_folder) if os.path.isfile(join(in_folder, i))
    ])
    #print(files)
    months = []
    with open(ofile, 'w') as f:
        f.write(pprint.pformat(locals()))
        for file in files:
            month = file.split('.')[-2][-8:-3]
            f.write(f'\n{"="*40}\n\nmonth {month}')
            mcorpus = textcorpus.conditionalCorpus(join(
                in_folder, file))  #, lines_are_documents=True)
            lda = ldamodel.LdaModel(mcorpus,
                                    num_topics=num_topics,
                                    id2word=mcorpus.dictionary)
            topics = lda.get_topics()
            f.write(
                pprint.pformat(lda.show_topics(num_topics, num_words=30)) +
                '\n')
            for w in wordlist:
                try:
                    topics = lda.get_term_topics(w)
                    f.write(f'\t{w}: {pprint.pformat(topics)}\t')
                except Exception:  # word not in the model's vocabulary
                    f.write(f'\t{w}: outofvocab\t')
            print(f'month {month} LDA complete')
Example #16
def lda_model(corpus_tfidf,
              dictionary,
              num_topics=10,
              num_words=10,
              vis='off'):

    lda_tfidf = ldamodel.LdaModel(corpus_tfidf, num_topics)
    lda_tfidf.save('lda_model')
    topics = lda_tfidf.show_topics(num_topics,
                                   num_words,
                                   log=False,
                                   formatted=False)
    words = dictionary.token2id
    topics_decoded = dict()

    for i in range(len(topics)):
        topic_no = 'Topic ' + str(i)
        topics_decoded[topic_no] = {}
        v = topics[i][1]
        for j in range(len(v)):
            word = list(words.keys())[list(words.values()).index(int(v[j][0]))]
            topics_decoded[topic_no][word] = v[j][1]

    if vis == 'on':
        pyLDAvis.enable_notebook()
        pyLDAvis.display(
            pyLDAvis.gensim.prepare(lda_tfidf, corpus_tfidf, dictionary))

    return topics_decoded
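Passing the dictionary to the model as id2word would avoid the id-to-word decoding entirely, since show_topics(formatted=False) then yields (topic_id, [(word, prob), ...]) pairs with real words; a sketch:

lda_tfidf = ldamodel.LdaModel(corpus_tfidf, num_topics, id2word=dictionary)
topics_decoded = {'Topic %d' % topic_id: dict(word_probs)
                  for topic_id, word_probs in
                  lda_tfidf.show_topics(num_topics, num_words, formatted=False)}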
Example #17
    def __getitem__(self, doc):
        """
        Similar to the LdaModel __getitem__ function, it returns topic proportions of a document passed.
        """
        lda_model = ldamodel.LdaModel(num_topics=self.num_topics,
                                      alpha=self.alphas,
                                      id2word=self.id2word)
        lda_model.topics = np.array(
            np.split(np.zeros(self.vocab_len * self.num_topics),
                     self.vocab_len))
        ldapost = LdaPost(num_topics=self.num_topics,
                          max_doc_len=len(doc),
                          lda=lda_model,
                          doc=doc)

        time_lhoods = []
        for time in range(0, self.num_time_slices):
            lda_model = self.make_lda_seq_slice(lda_model,
                                                time)  # create lda_seq slice
            lhood = LdaPost.fit_lda_post(ldapost, 0, time, self)
            time_lhoods.append(lhood)

        doc_topic = ldapost.gamma / ldapost.gamma.sum()
        # should the likelihoods be returned as well?
        return doc_topic
Example #18
def create_model(K):
    pre_collection = preprocess()

    dictionary = corpora.Dictionary(pre_collection)
    dictionary.filter_extremes(no_below=2, no_above=0.8, keep_n=500)
    docs_ids = [dictionary.doc2bow(doc) for doc in pre_collection]
    lda = glda.LdaModel(docs_ids, num_topics=K, id2word=dictionary)
    return lda, dictionary, docs_ids, pre_collection
Example #19
 def testPersistence(self):
     model = ldamodel.LdaModel(self.corpus, num_topics=2)
     model.save(testfile())
     model2 = ldamodel.LdaModel.load(testfile())
     self.assertEqual(model.num_topics, model2.num_topics)
     self.assertTrue(numpy.allclose(model.expElogbeta, model2.expElogbeta))
     tstvec = []
     self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
Example #20
 def createModel(self, doc):
     dictionary = corpora.Dictionary(doc)
     corpus = [dictionary.doc2bow(text) for text in doc]
     model = ldamodel.LdaModel(
         corpus,
         num_topics=config["topic_modeling"]["num_topics"],
         id2word=dictionary)
     return model, dictionary
Example #21
def create_models(texts):
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20, passes=10)
    print(lda.show_topics())
    vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    print(vis_data)
    pyLDAvis.show(vis_data)
Example #22
    def testLargeMmapCompressed(self):
        fname = testfile() + '.gz'
        model = ldamodel.LdaModel(self.corpus, num_topics=2)

        # simulate storing large arrays separately
        model.save(fname, sep_limit=0)

        # test loading the large model arrays with mmap
        self.assertRaises(IOError, ldamodel.LdaModel.load, fname, mmap='r')
Example #23
 def testPersistenceCompressed(self):
     fname = testfile() + '.gz'
     model = ldamodel.LdaModel(self.corpus, num_topics=2)
     model.save(fname)
     model2 = ldamodel.LdaModel.load(fname, mmap=None)
     self.assertEqual(model.num_topics, model2.num_topics)
     self.assertTrue(numpy.allclose(model.expElogbeta, model2.expElogbeta))
     tstvec = []
     self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
Example #24
def get_lda(c, n_topics=4):
    """
    Get LDA model
    :param n_topics: Number of topics to use in model
    :param c: Corpus object
    :return: LDA model
    """
    lda = ldamodel.LdaModel(c.corpus, id2word=c.dic, num_topics=n_topics)
    return lda
Example #25
def train_LDA(posts):
    """
    Uses gensim to train an LDA model for topic modeling.
    """
    dct = Dictionary(posts)
    dct.filter_extremes(no_below=2, no_above=0.5)
    corpus = [dct.doc2bow(text) for text in posts]
    model = ldamodel.LdaModel(corpus, num_topics=10, id2word=dct, passes=1)
    return model, corpus
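A quick usage sketch for train_LDA (the toy posts are hypothetical; note that filter_extremes drops words appearing in fewer than 2 or in more than 50% of the posts):

posts = [["cats", "purr"], ["dogs", "bark"], ["cats", "nap"], ["dogs", "fetch"], ["fish", "swim"]]
model, corpus = train_LDA(posts)
print(model.show_topics(num_topics=3))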
Example #26
 def suggested_lda_model(self):
     """
     Returns closest corresponding ldamodel object corresponding to current hdp model.
     The hdp_to_lda method only returns corresponding alpha, beta values, and this method returns a trained ldamodel.
     The num_topics is m_T (default is 150) so as to preserve the matrice shapes when we assign alpha and beta.
     """
     alpha, beta = self.hdp_to_lda()
     ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state)
     ldam.expElogbeta[:] = beta
     return ldam
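A usage sketch for the method above, assuming a bag-of-words corpus and dictionary are already built:

from gensim.models import HdpModel

hdp = HdpModel(corpus=corpus, id2word=dictionary)
lda = hdp.suggested_lda_model()  # an LdaModel with num_topics == hdp.m_T
print(lda.num_topics)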
Example #27
    def testPersistenceIgnore(self):
        fname = testfile('testPersistenceIgnore')
        model = ldamodel.LdaModel(self.corpus, num_topics=2)
        model.save(fname, ignore='id2word')
        model2 = ldamodel.LdaModel.load(fname)
        self.assertTrue(model2.id2word is None)

        model.save(fname, ignore=['id2word'])
        model2 = ldamodel.LdaModel.load(fname)
        self.assertTrue(model2.id2word is None)
Example #28
    def testPersistenceIgnore(self):
        fname = get_tmpfile('gensim_models_lda_testPersistenceIgnore.tst')
        model = ldamodel.LdaModel(self.corpus, num_topics=2)
        model.save(fname, ignore='id2word')
        model2 = ldamodel.LdaModel.load(fname)
        self.assertTrue(model2.id2word is None)

        model.save(fname, ignore=['id2word'])
        model2 = ldamodel.LdaModel.load(fname)
        self.assertTrue(model2.id2word is None)
Example #29
 def testTransform(self):
     # create the transformation model
     model = ldamodel.LdaModel(self.corpus, num_topics=2)
     
     # transform one document
     doc = list(self.corpus)[0]
     transformed = model[doc]
     
     vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests
     expected = [0.0, 1.0]
     self.assertTrue(numpy.allclose(sorted(vec), sorted(expected))) # must contain the same values, up to re-ordering
Example #30
def generateTopicModels(dictionary, bow, topicCounts):
    models = []
    alphas = []
    betas = []
    for amountTopics in topicCounts:
        ldaModel = ldamodel.LdaModel(bow, amountTopics, dictionary, passes=20, per_word_topics=False)
        models.append(ldaModel)
        alphas.append('default')
        betas.append('default')
        print(f'Generated topic model with parameters: (topics: {amountTopics}), (alpha: default), (beta: default)')
    return models, alphas, betas
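To choose among the generated models, a common follow-up is a coherence sweep; a sketch assuming models comes from generateTopicModels and texts is the tokenized collection behind bow and dictionary:

from gensim.models import CoherenceModel

for lda in models:
    cm = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
    print(lda.num_topics, cm.get_coherence())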