def create_LDA(comment_dict,
               num_topics=20,
               chunk_size=50,
               max_iter=20,
               from_db=True,
               get_data_func=None):
    lda = None
    text_gen = data_preprocessor(max_iter=max_iter,
                                 from_db=from_db,
                                 get_data_func=get_data_func)
    corpus = []
    for _, stemmed_text, _ in text_gen:
        if stemmed_text:
            corpus.append(comment_dict.doc2bow(stemmed_text))
        if len(corpus) == chunk_size:
            if lda is None:
                lda = LdaModel(corpus=corpus,
                               num_topics=num_topics,
                               id2word=comment_dict,
                               per_word_topics=True,
                               passes=10)
            else:
                lda.update(corpus=corpus)
            corpus = []
    # Train on any leftover documents that did not fill a complete chunk.
    if corpus:
        if lda is None:
            lda = LdaModel(corpus=corpus,
                           num_topics=num_topics,
                           id2word=comment_dict,
                           per_word_topics=True,
                           passes=10)
        else:
            lda.update(corpus=corpus)
    return lda
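
A hedged usage sketch: assuming `data_preprocessor` yields `(doc_id, stemmed_tokens, raw_text)` tuples (as the unpacking above suggests) and that `comment_dict` is a gensim `Dictionary` built over the same stream, a call might look like:

from gensim.corpora import Dictionary

# Hypothetical: one pass over the preprocessed texts to build the dictionary.
comment_dict = Dictionary(
    stemmed for _, stemmed, _ in data_preprocessor(max_iter=20, from_db=True))
lda = create_LDA(comment_dict, num_topics=20, chunk_size=50, max_iter=20)
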
def main(vocab_file, inv_vocab_file, infiles):
    vocab = load_pickled(vocab_file)
    inv_vocab = load_pickled(inv_vocab_file)

    lda = LdaModel(id2word=inv_vocab, num_topics=200)

    for f in infiles:
        tc = TweetCorpus(f, vocab)
        lda.update(tc)

    lda.save('topics.lda')
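
`TweetCorpus` is not defined in this snippet; `LdaModel.update` only needs an iterable of bag-of-words documents, so a minimal sketch of such a streaming corpus, assuming one whitespace-tokenized tweet per line, might look like:

class TweetCorpus:
    def __init__(self, path, vocab):
        self.path = path    # text file, one tweet per line
        self.vocab = vocab  # gensim Dictionary mapping words to ids

    def __iter__(self):
        with open(self.path, encoding='utf-8') as f:
            for line in f:
                # Stream one bag-of-words vector at a time; nothing is held in memory.
                yield self.vocab.doc2bow(line.split())
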
Example 3
def gensim_lda(d):
    from gensim import corpora, models
    from gensim.models.ldamodel import LdaModel
    # Flatten the list of document batches to build the dictionary.
    list_doc = []
    for batch in d:
        list_doc.extend(batch)

    dictionary = corpora.Dictionary(list_doc)
    model = LdaModel(num_topics=20, id2word=dictionary)
    for i, batch in enumerate(d):
        print('Generating corpus and updating model', i)
        corpus = [dictionary.doc2bow(doc) for doc in batch]
        model.update(corpus)

    model.save('model_20')
    print(model.show_topics(num_topics=20, num_words=10))
Example 4
def build_all():
    """
    Build and save a batch of models to evaluate.
    """

    with sqlite3.connect('../database/chat.db') as conn:

        # get vocabulary
        MIN_OCCURRENCE = 100
        vocab = Dictionary([
            pd.read_sql(
                'select word from words where freq >= {}'.format(
                    MIN_OCCURRENCE), conn)['word'].tolist()
        ])

        # models for different number of topics
        N_EPOCHS = 10
        for n_topics in range(5, 26, 5):

            # one model per each aggregation style
            for style, sql in zip(['basic', 'user', 'user_day_room'],
                                  get_model_sql()):

                # init model
                lda_model = LdaModel(id2word=vocab,
                                     num_topics=n_topics,
                                     alpha='auto',
                                     per_word_topics=True)

                # do training
                print('training model_{0}_{1}'.format(style, n_topics))
                for epoch in range(N_EPOCHS):
                    print('\tepoch', epoch, '...', end='\r')
                    for chunk in pd.read_sql(sql, conn, chunksize=10000):
                        chunk_corpa = [
                            vocab.doc2bow(text)
                            for text in chunk['lemma'].str.split(' ').tolist()
                        ]
                        lda_model.update(chunk_corpa)
                    print('\tepoch', epoch, '... done!')

                # Save model to disk.
                lda_model.save("saved_models/model_{0}_{1}".format(
                    style, n_topics))
Example 5
def load_or_build_lda_model(conn):

    try:
        # load vocab dictionary
        vocab = SaveLoad.load('../model_development/saved_models/model_user_day_room_10.id2word')

        # load model
        ldamodel = LdaModel.load('../model_development/saved_models/model_user_day_room_10')

        print('Pretrained lda model loaded!')

    except FileNotFoundError:

        # query for aggregating texts per user per room per day
        sql = """
        select group_concat(lemma, ' ') as lemma
        from lemmas
            join (
                select chat_id
                    , from_userid
                    , strftime('%Y-%m-%d', sent_at) as sent_day
                    , room_id
                from chats
                ) using (chat_id)
        where nullif(lemma, '') is not null
        group by from_userid
            , sent_day
            , room_id
        order by random();
        """

        # get vocabulary
        MIN_OCCURRENCE = 100
        vocab = Dictionary([pd.read_sql('select word from words where freq >= {}'.format(MIN_OCCURRENCE), conn)['word'].tolist()])

        # models for different number of topics
        N_EPOCHS = 10
        n_topics = 10
        style = 'user_day_room'

        # init model
        ldamodel = LdaModel(
            id2word=vocab,
            num_topics=n_topics,
            alpha='auto',
            per_word_topics=True)

        # do training
        print('training model_{0}_{1}'.format(style, n_topics))
        for epoch in range(N_EPOCHS):
            print('\tepoch', epoch, '...', end='\r')
            for chunk in pd.read_sql(sql, conn, chunksize=10000):
                chunk_corpa = [vocab.doc2bow(text) for text in chunk['lemma'].str.split(' ').tolist()]
                ldamodel.update(chunk_corpa)
            print('\tepoch', epoch, '... done!')

        # Save model to disk.
        ldamodel.save("saved_models/model_{0}_{1}".format(style, n_topics))

    return vocab, ldamodel
Example 6
from numpy import ones

dictionary = corpora.Dictionary(resultlist)
corpus = [dictionary.doc2bow(text) for text in resultlist]

with open('mcdi_word.csv', newline='') as f:
    reader = csv.reader(f)
    wlist = list(reader)

# For each seed-word row, collect the dictionary ids of the words it contains.
idlist = []
for row in wlist:
    idrow = [word_id for word, word_id in dictionary.token2id.items() if word in row]
    idlist.append(idrow)

a = 0.05
ntopic = 75
# Seed the topic-word prior: boost the eta entries of each topic's seed words.
eta_arr = ones((ntopic, len(dictionary))) * 0.5
for x in range(len(idlist)):
    for word_id in idlist[x]:
        eta_arr[x, word_id] *= 1000


# Build the model with the seeded priors so the boosts take effect.
lda = LdaModel(id2word=dictionary, num_topics=ntopic, alpha=a, eta=eta_arr)
lda.update(corpus)
topiclist = lda.print_topics(num_topics=75, num_words=50)
lda.save('childs_file_75.model')
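
Scaling selected rows of `eta_arr` is the usual way to do seeded ("guided") LDA in gensim: topic x gets a prior strongly biased toward the word ids collected from row x of `mcdi_word.csv`, while every other word keeps the uniform 0.5 prior.
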
Example 7
                if i == 0:
                    # First pass for this configuration: start from a fresh model.
                    if lda_model is not None:
                        del lda_model

                    lda_model = LdaModel(
                        corpus=train_data,
                        id2word=dictionary,
                        num_topics=n_component,
                        decay=learning_decay,
                        iterations=valid_iter,
                        random_state=0,
                    )
                else:
                    lda_model.update(corpus=train_data,
                                     decay=learning_decay,
                                     iterations=valid_iter)

                train_s.append(
                    CoherenceModel(model=lda_model,
                                   corpus=train_data,
                                   dictionary=dictionary,
                                   coherence='u_mass').get_coherence())
                test_s.append(
                    CoherenceModel(model=lda_model,
                                   corpus=test_data,
                                   dictionary=dictionary,
                                   coherence='u_mass').get_coherence())

                train_p.append(lda_model.log_perplexity(train_data))
                test_p.append(lda_model.log_perplexity(test_data))
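
This fragment arrives without its enclosing loops. Judging from the free variables (`i`, `n_component`, `learning_decay`, `valid_iter`, `train_data`, `test_data`), it sits inside a hyperparameter sweep that retrains per configuration and records coherence and perplexity for both splits. A purely hypothetical reconstruction of that scaffolding, with every bound and split assumed:

# Hypothetical scaffolding around the fragment above.
for n_component in (5, 10, 20):
    for learning_decay in (0.5, 0.7, 0.9):
        lda_model = None
        train_s, test_s, train_p, test_p = [], [], [], []
        for i, valid_iter in enumerate((10, 20, 50)):
            # ... the fragment above runs here: build the model when i == 0,
            # otherwise keep updating the same model, then score both splits ...
            pass
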
Example 8
File: lda.py  Project: freygit/36
class LDA(object):

    def __init__(self, topics=10,
                 worker=3,
                 pretrained_model=None,
                 dictionary=None):
        """
        Initialize LDA model training.
        Args:
            topics -- number of topics
            worker -- parallelism parameter, usually the number of cores minus one
            pretrained_model -- pretrained model; online updating is supported, so a previously trained model can be loaded
            dictionary -- words are mapped to ids during training, so the model comes with a matching id-to-word dictionary
        Example:
            >>> lda = LDA(topics=20, worker=2,
                          pretrained_model=model_file,
                          dictionary=dictionary_file)
            >>> corpus = read_file(corpus_file)  # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """

        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)

    def save(self, model_file, dictionary_file):
        """
        保存训练的模型,同时保存对应的词典
        Args:
            model_file -- 模型文件
            dictionary_file -- 词典文件
        Returns:
            无
        """

        if self._model:
            self._model.save(model_file)
        if self._common_dictionary:
            self._common_dictionary.save(dictionary_file)

    def update(self, corpus=()):
        """
        Online update: refresh the existing model with new documents.
        Args:
            corpus -- list of tokenized documents to update with
        """

        if not self._model and len(corpus) > 0:
            self._common_dictionary = Dictionary(corpus)
            corpus_data = [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model = LdaModel(corpus_data, num_topics=self._topics, id2word=self._common_dictionary)
        elif self._model and len(corpus) > 0:
            self._common_dictionary.add_documents(corpus)
            new_corpus_data = [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model.update(new_corpus_data)

    def inference(self, document=()):
        """
        Infer the topic distribution of a new document.
        Args:
            document -- the document, given as a list of words
        Returns:
            list of (topic id, probability) pairs
        """
        if self._model:
            # get_document_topics expects a single bag-of-words vector.
            bow = self._common_dictionary.doc2bow(document)
            return self._model.get_document_topics(bow)
        return []

    @property
    def model(self):
        return self._model

    @property
    def dictionary(self):
        return self._common_dictionary
Example 10
def train_LDA(base_path,
              table_paths,
              batch_size,
              limit,
              use_dictionary=False,
              **kwargs):

    model_name = dic2name(kwargs)
    print("Model: ", model_name)
    topic_num = kwargs['tn']

    # Pass 1 get the dictionary
    if use_dictionary == 'True':
        dic = Dictionary.load(
            join(LDA_CACHE, 'dictionary_{}'.format(model_name)))
    else:

        dic = Dictionary([])
        b = 0
        for corpus in corpus_iter(base_path, table_paths, batch_size, limit,
                                  **kwargs):
            dic.add_documents(corpus)
            print('Dictionary batch {}: current dic size {}'.format(
                b, len(dic)))
            b += 1

        # save dictionary
        dic.save(join(LDA_CACHE, 'dictionary_{}'.format(model_name)))

    print("Dictionary size", len(dic))

    # Pass 2 train LDA
    whole_corpus = corpus_iter(base_path, table_paths, batch_size, limit,
                               **kwargs)
    first_batch = next(whole_corpus)
    first_bow = [dic.doc2bow(text, allow_update=False) for text in first_batch]

    lda = LdaModel(first_bow,
                   id2word=dic,
                   num_topics=topic_num,
                   minimum_probability=0.0)
    batch_no = 0
    print('LDA update batch {}'.format(batch_no))

    for batch in whole_corpus:
        batch_bow = [dic.doc2bow(text, allow_update=False) for text in batch]
        lda.update(batch_bow)
        batch_no += 1
        print('LDA update batch {}'.format(batch_no))

    # Save model to disk.
    temp_file = join(LDA_CACHE, "model_{}".format(model_name))
    lda.save(temp_file)

    print("Training from {} done. Batch_size: {}, long str tokenization threshold: {}, "
          "numerical representations: {}.\nTotal size of dictionary: {}".format(
              table_paths, batch_size, kwargs['thr'], kwargs['num'], len(dic)))
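
`corpus_iter` and `dic2name` are defined elsewhere in this project; for context, a minimal sketch of what `corpus_iter` plausibly does, with the file layout and tokenization entirely assumed:

from os.path import join

# Hypothetical sketch: yield batches of tokenized documents read from table files.
def corpus_iter(base_path, table_paths, batch_size, limit, **kwargs):
    batch = []
    for table in table_paths:
        with open(join(base_path, table), encoding='utf-8') as f:
            for n, line in enumerate(f):
                if n == limit:
                    break
                batch.append(line.split())
                if len(batch) == batch_size:
                    yield batch
                    batch = []
    if batch:
        yield batch
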
Example 11
class LDAWDF:
    mysql: mysql.MySQL
    ldamodel: LdaModel
    dictionary = None
    corpus = None

    def __init__(self, mysql):
        self.mysql = mysql
        self.dataFolder = './data/'
        self.saveFile = 'lda_model'
        self.saveFileDict = 'lda_model_dict'

    def trainFromStart(self):
        with self.mysql as db:
            content = db.getContentsText()
        documents = []
        for item in content:
            documents.append(item['content'].split())

        self.dictionary = corpora.Dictionary(documents)

        self.dictionary.filter_extremes(no_below=5, no_above=0.5)

        doc_term_matrix = [self.dictionary.doc2bow(doc) for doc in documents]

        self.corpus = doc_term_matrix

        # Running and Training LDA model on the document term matrix.
        print("Starting to train LDA Model...")
        self.ldamodel = LdaModel(
            doc_term_matrix,
            num_topics=200,
            id2word=self.dictionary,
            passes=100)

    def printTest(self):
        print(self.ldamodel.print_topics(num_topics=10, num_words=5))

    def save(self):
        self.ldamodel.save(self.dataFolder + self.saveFile)
        self.dictionary.save(self.dataFolder + self.saveFileDict)

    def canLoad(self):
        my_file = Path(self.dataFolder + self.saveFile)
        my_file_dict = Path(self.dataFolder + self.saveFileDict)
        return my_file.is_file() and my_file_dict.is_file()

    def update(self, corpus):
        self.ldamodel.update(corpus)

    def load(self, subfolder=None):
        if subfolder:
            sf = subfolder + '/'
        else:
            sf = ''
        self.ldamodel = LdaModel.load(self.dataFolder + sf + self.saveFile)
        self.dictionary = gensim.corpora.Dictionary.load(self.dataFolder + sf + self.saveFileDict)

    def fillDb(self):
        topics = {}
        result = []
        result2 = []
        nbTopics = self.ldamodel.get_topics().shape[0]
        # "Old"
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 3)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            words = []
            for topicTerm in topicTerms:
                words.append(self.dictionary.get(topicTerm[0]))
            topics[topicId] = ' '.join(words)
        with self.mysql as db:
            contentsText = db.getContentsText()
            for element in contentsText:
                bow = self.dictionary.doc2bow(element['content'].split())
                docTopics = self.ldamodel.get_document_topics(bow, minimum_probability=0.05)
                if len(docTopics) > 0:
                    docTopics.sort(key=lambda x: x[1], reverse=True)
                    result.append((element['url'], topics[docTopics[0][0]]))
                    for docTopic in docTopics:
                        result2.append((element['url'], docTopic[0], str(docTopic[1])))
            db.emptyUrlsTopic()
            db.emptyCurrentUrlsTopic()
            db.emptyCurrentUserTags()
            db.setCurrentUrlsTopic(result2)
            db.setPrecalcTopics()
        # "New"
        terms = []
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 5)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            for topicTerm in topicTerms:
                terms.append((topicId, self.dictionary.get(topicTerm[0]), str(topicTerm[1])))
        with self.mysql as db:
            db.emptyLdaTopics()
            db.setLdaTopics(terms)

    def get_terms_topics(self, keywords):
        bow = self.dictionary.doc2bow(keywords[:30])
        topics = {}
        keywordsResult = {}
        for word in bow:
            wordTopics = self.ldamodel.get_term_topics(word[0], 0.05)
            keywordsResult[word[0]] = {'word': self.dictionary.get(word[0]), 'topics': wordTopics}
            for wordTopic in wordTopics:
                wordTopicId = wordTopic[0]
                if wordTopicId not in topics:
                    topics[wordTopicId] = self.ldamodel.show_topic(wordTopicId)
        return {'topics': topics, 'keywords': keywordsResult}
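
A short usage sketch for this class, following the load-or-train flow its `canLoad` method suggests; the `mysql.MySQL` constructor arguments are project-specific and assumed here:

# Hypothetical wiring; adjust the MySQL constructor to the project's config.
db = mysql.MySQL()
wdf = LDAWDF(db)
if wdf.canLoad():
    wdf.load()            # reuse the saved model and dictionary from ./data/
else:
    wdf.trainFromStart()  # 200 topics, 100 passes over the chat contents
    wdf.save()
wdf.printTest()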