Example #1
def _run(self, info):
    nbprint('Running LDA')
    vocab = data.load_vocab(info)
    id2word = {e['id']: e['token'] for e in vocab}
    # Columns of the sparse input matrix are the documents.
    corpus = Sparse2Corpus(self.input_mat)
    lda = LdaModel(corpus, id2word=id2word, num_topics=info["num_topics"])
    # W: terms x topics; H: topics x documents.
    self.W = lda.get_topics().T
    self.H = np.zeros((info["num_topics"], self.input_mat.shape[1]))
    for idx, doc in enumerate(corpus):
        weights = lda[doc]
        for topic, value in weights:
            self.H[topic, idx] = value
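The W/H naming above mirrors an NMF-style factorization: W (terms x topics) times H (topics x documents) approximates the column-stochastic term-document matrix. A self-contained sketch of the same extraction on gensim's bundled toy corpus (the variable names here are illustrative, not from the original):

import numpy as np
from gensim.models import LdaModel
from gensim.test.utils import common_corpus, common_dictionary

lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=2)
W = lda.get_topics().T                 # terms x topics
H = np.zeros((2, len(common_corpus)))  # topics x documents
for idx, doc in enumerate(common_corpus):
    for topic, value in lda[doc]:
        H[topic, idx] = value
print((W @ H).shape)  # (n_terms, n_docs); each column sums to roughly 1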
Example #2
def make_model(self, texts, dictionary, corpus, num_topics, chunksize=1000, iterations=400, passes=40):
    model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=num_topics,
                     random_state=0,
                     chunksize=chunksize,
                     iterations=iterations,
                     passes=passes,
                     alpha='asymmetric',
                     per_word_topics=True)
    # Score the model with c_v topic coherence.
    coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
    coh = coherencemodel.get_coherence()
    return model, coh
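A hypothetical sweep over candidate topic counts using make_model, keeping the model with the best c_v coherence; `tm` stands in for whatever object owns make_model, and texts, dictionary, and corpus are assumed to be prepared beforehand:

best_model, best_coh, best_k = None, float('-inf'), None
for k in (5, 10, 15, 20):
    model, coh = tm.make_model(texts, dictionary, corpus, num_topics=k)
    if coh > best_coh:
        best_model, best_coh, best_k = model, coh, k
print('best num_topics = {} (c_v = {:.3f})'.format(best_k, best_coh))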
Example #3
def gen_ldamodel(self):
    mdf = MyDataFrame()
    df = mdf.new_DataFrame()
    df2 = mdf.m_cut(df)
    filelist = []
    for i in range(len(df2)):
        filelist.append(df2['fenci'][i])

    # Build the dictionary and the bow sparse matrix for the documents.
    dictionary = corpora.Dictionary(filelist)
    corpus = [dictionary.doc2bow(text) for text in filelist]  # still a list of lists

    tfidf_model = models.TfidfModel(corpus)  # build the TF-IDF model
    corpus_tfidf = tfidf_model[corpus]  # TF-IDF weights for the documents

    # Fit the LDA model.
    from gensim.models.ldamodel import LdaModel
    ldamodel = LdaModel(corpus, id2word=dictionary, num_topics=10, passes=10)
    # List the most important topics.
    ldamodel.print_topics(num_topics=10, num_words=10)

    # Topic distribution of each document. Query with the same kind of
    # matrix the model was trained on: it was fit on raw bow counts, so
    # use corpus, not corpus_tfidf.
    corpus_lda = ldamodel[corpus]
    for doc in corpus_lda:
        print(doc)
    ldamodel.get_topics()  # topics x words probability matrix

    # Find the topic closest to a document's content (here 0.txt, the first document).
    query_bow = dictionary.doc2bow(df2['fenci'][0])  # term-frequency vector
    query_tfidf = tfidf_model[query_bow]  # TF-IDF vector
    print("After conversion:", query_tfidf[:10])
    ldamodel.get_document_topics(query_bow)  # expects the document's bow vector
    ldamodel[query_bow]  # list of the closest topics
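The method computes corpus_tfidf but trains on raw counts. If the TF-IDF variant were wanted instead (a common shortcut, though formally debatable, since LDA is defined over counts), train and query with the same representation; a sketch reusing the variables above:

ldamodel_tfidf = LdaModel(corpus_tfidf, id2word=dictionary,
                          num_topics=10, passes=10)
print(ldamodel_tfidf[tfidf_model[query_bow]])  # closest topics, TF-IDF space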
Example #4
class Lda:
    def __init__(self):
        self.model = None
        self.common_dictionary = None

    def train(self, common_texts, num_topics):
        self.common_dictionary = Dictionary(common_texts)
        common_corpus = [
            self.common_dictionary.doc2bow(text) for text in common_texts
        ]
        self.model = LdaModel(common_corpus,
                              num_topics=num_topics,
                              alpha='auto',
                              eval_every=5)

    def get_topics(self, words=None):
        # Transpose to a words x topics matrix.
        s = self.model.get_topics().T
        if words is not None:
            # doc2idx returns -1 for out-of-vocabulary words, which would
            # silently select the last row; pass only known words.
            word_ids = self.common_dictionary.doc2idx(words)
            s = s[word_ids]
        return s
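Illustrative usage of this wrapper on gensim's bundled common_texts toy corpus (not part of the original example):

from gensim.test.utils import common_texts

lda = Lda()
lda.train(common_texts, num_topics=3)
print(lda.get_topics().shape)                   # (vocab_size, 3)
print(lda.get_topics(words=['human', 'time']))  # weights for two in-vocabulary words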
Example #5
def lda(fname,
        indF,
        nTopics=20,
        passes=1,
        iterations=50,
        fmax=math.inf,
        fmin=0,
        head='cancer_py_gen_gvLDA_'):
    cts = pd.read_csv(fname + '.csv', header=0, index_col=0, dtype={0: str})
    # cts = pd.read_csv(fname + '.csv', header=None, index_col=None)
    ind = pd.read_csv(indF + '.csv', header=None)
    patID = cts.index
    gvID = cts.columns
    # Keep rows with a positive fold label; `splits` is the number of CV folds.
    rows = np.where(ind > 0)[0]
    splits = np.max(np.array(ind))
    patID = patID[rows]
    phi = cts.iloc[rows]
    ind = ind.iloc[rows, 0]

    for i in range(1, splits + 1):

        # initialize log
        ofname = head + str(nTopics) + '_' + str(i)
        ch = logging.FileHandler('logs/' + ofname + '.log', mode='w')
        ch.setLevel(logging.INFO)
        formatter = logging.Formatter('%(levelname)s : %(message)s')
        ch.setFormatter(formatter)
        logger.addHandler(ch)

        # training set
        rowsT = np.where(ind != i)
        X = np.asarray(phi.iloc[rowsT])
        # Keep features whose total training count lies strictly between fmin and fmax.
        cols = np.logical_and(fmin < X.sum(axis=0), X.sum(axis=0) < fmax)
        X = X[:, cols]
        X_corp = Dense2Corpus(np.array(X), documents_columns=False)

        # valid set
        rowsV = np.where(ind == i)
        X_test = np.asarray(phi.iloc[rowsV])
        X_test = X_test[:, cols]
        X_testcorp = Dense2Corpus(np.array(X_test), documents_columns=False)

        lda = LdaModel(X_corp,
                       nTopics,
                       alpha='auto',
                       passes=passes,
                       iterations=iterations)
        ofname = 'data/' + ofname
        lda.save(ofname + '_model')
        gvTop = pd.DataFrame(lda.get_topics())
        gvTop.columns = np.asarray(gvID)[cols]
        gvTop.to_csv(ofname + '_genes.csv')
        pd.DataFrame(lda.alpha).to_csv(ofname + '_alpha.csv')
        patTop = pd.DataFrame(get_doc_topic(X_corp, lda))
        patTop.index = patID[rowsT]
        patTop.to_csv(ofname + '_train.csv')
        patTop = pd.DataFrame(get_doc_topic(X_testcorp, lda))
        patTop.index = patID[rowsV]
        patTop.to_csv(ofname + '_valid.csv')

        logger.removeHandler(ch)  # stop log
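The helper get_doc_topic is not defined in this excerpt; a plausible minimal implementation (an assumption, not the original code) would densify each document's topic distribution:

def get_doc_topic(corpus, model):
    '''Return a dense docs x topics array of topic probabilities.'''
    rows = []
    for doc in corpus:
        dist = np.zeros(model.num_topics)
        for topic, prob in model.get_document_topics(doc, minimum_probability=0.0):
            dist[topic] = prob
        rows.append(dist)
    return np.vstack(rows)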
Example #6
def get_most_common(title_list,
                    dic,
                    num=COMMON_TOPIC_WORDS_NUM,
                    random_state=None):
    '''Return the num most important words of the most frequent topic.'''

    bow = [dic.doc2bow(title) for title in title_list]
    # TODO: determine and set an appropriate number of topics.
    if LOG_LEVEL == 'DEBUG':
        random_state = 123
    model = LdaModel(bow,
                     id2word=dic,
                     num_topics=TOPIC_NUM,
                     random_state=random_state)
    # Classify each title.
    topic_id_list = []
    for idx, title in enumerate(title_list):
        logger.debug('title')
        logger.debug(title)
        doc_topics_tuple = model.get_document_topics(dic.doc2bow(title),
                                                     minimum_probability=0.0)
        doc_topic_dist = np.array([[val[0], val[1]] for val in doc_topics_tuple])
        if idx == 0:
            topic_dist_arr = doc_topic_dist
        else:
            topic_dist_arr = np.vstack([topic_dist_arr, doc_topic_dist])
        topic_id = int(
            sorted(doc_topic_dist, key=lambda x: x[1], reverse=True)[0][0])
        topic_id_list.append(topic_id)
    if LOG_LEVEL == 'DEBUG':
        # Topic distribution per title.
        df_topic_dist = pd.DataFrame({
            'title': title_list,
            'topic_id': topic_id_list
        })
        # Word distribution per topic.
        cols = ['{}_{}'.format(word_no, elem) \
                for word_no in range(10) \
                    for elem in range(2)]
        df_word_dist = pd.DataFrame()
        arr_dist = topic_dist_arr.reshape(-1, model.get_topics().shape[0], 2)
        for topic_id in range(model.get_topics().shape[0]):
            df_topic_dist['topic_{}'.format(topic_id)] = arr_dist[:, topic_id, 1]
            topic_terms = model.get_topic_terms(topic_id,
                                                topn=int(len(cols) / 2))
            topic_terms_2 = []
            for term in topic_terms:
                topic_terms_2 = topic_terms_2 + [
                    dic.id2token[term[0]], term[1]
                ]
            # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
            df_word_dist = pd.concat([
                df_word_dist,
                pd.Series(topic_terms_2,
                          name='topic_{}'.format(topic_id)).to_frame().T
            ])
        df_topic_dist.to_csv(
            os.path.join('test', 'classified_topic_{}.csv' \
                .format(datetime.today().strftime(format='%Y%m%d'))),
            index=False,
            encoding='cp932'
        )
        df_word_dist.columns = cols
        df_word_dist.to_csv(
            os.path.join('test', 'word_distribution_per_topic_{}.csv' \
                .format(datetime.today().strftime(format='%Y%m%d'))),
            encoding='cp932'
        )
    # Find the most frequent topic.
    topic_id_counter = Counter(topic_id_list)
    most_common_topic_id = topic_id_counter.most_common(1)[0][0]
    topic_terms = model.get_topic_terms(most_common_topic_id)
    logger.debug('')
    logger.debug('topic_id_counter: ' + str(topic_id_counter))
    logger.debug('most_common_topic_id: ' + str(most_common_topic_id))
    logger.debug(topic_terms)
    # Take the num most important words of the most frequent topic.
    important_word_list = [
        dic.id2token[topic_tuple[0]] for topic_tuple in topic_terms[:num]
    ]
    logger.debug(important_word_list)
    return important_word_list
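An illustrative call with a toy title list; TOPIC_NUM, COMMON_TOPIC_WORDS_NUM, LOG_LEVEL, and logger are module-level names in the original. Note that gensim builds Dictionary.id2token lazily, so touch one entry before the function dereferences it:

from gensim.corpora import Dictionary

title_list = [['apple', 'pie'], ['apple', 'tart'], ['car', 'engine']]
dic = Dictionary(title_list)
_ = dic[0]  # force id2token to be populated
print(get_most_common(title_list, dic, num=3))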
Example #7
class GensimLDA:
    def __init__(self, texts):
        self.dictionary = Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]

        self.k_topics = None
        self.model = None

    def fit(self, k_topics, iterations=50):
        '''Fit an LDA model with k_topics topics.'''
        self.k_topics = k_topics
        self.model = LdaModel(corpus=self.corpus,
                              id2word=self.dictionary,
                              num_topics=k_topics,
                              iterations=iterations)

    def get_document_topic_matrix(self, X=None):
        '''Returns an n_docs x k_topics array of probabilities
        of a topic in a given document.'''
        if X is None:
            X = self.corpus
        else:
            X = [self.dictionary.doc2bow(text) for text in X]

        n_docs = len(X)
        V = np.zeros((n_docs, self.k_topics))

        # Extract assignments; equivalent to self.model[X].
        some_iterable = self.model.get_document_topics(X)
        for i, doc_topic in enumerate(some_iterable):
            for topic_id, prob in doc_topic:
                V[i, topic_id] = prob
        return V

    def get_topic_term_matrix(self):
        '''Returns an k_topics x m_words array of probabilities
        of a word in a given topic.'''
        return self.model.get_topics()

    def print_topics(self, top_n=10):
        '''Prints the top_n words in each topic.'''
        for row in self.get_topic_term_matrix():
            # Word ids sorted by descending probability.
            ids = np.argsort(row)[::-1]

            for k in ids[:top_n]:
                weight = row[k]
                word = self.dictionary.id2token[k]
                print(k, word, weight)
            print()

    def print_topic_words(self, topic_num, topn=None):
        '''Prints the top words and probabilities of a given topic in
        descending probability.'''
        for tok_id, prob in self.model.get_topic_terms(topic_num, topn=topn):
            word = self.dictionary.id2token[tok_id]
            print(word, prob)

    def get_topic_bows(self, num_words=10):
        '''Returns a list (one per topic) of the top num_words words.'''
        q = self.model.show_topics(num_topics=self.k_topics,
                                   num_words=num_words,
                                   formatted=False)
        topics = []
        for topic_id, topic in q:
            words = [w for w, p in topic]
            topics.append(words)
        return topics
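A quick end-to-end run on gensim's bundled common_texts (a sketch, not from the original); the one-off dictionary access populates the lazily built id2token map that print_topics relies on:

from gensim.test.utils import common_texts

glda = GensimLDA(common_texts)
glda.fit(k_topics=3)
print(glda.get_document_topic_matrix().shape)  # (n_docs, 3)
print(glda.get_topic_term_matrix().shape)      # (3, vocab_size)
_ = glda.dictionary[0]  # populate id2token
glda.print_topics(top_n=5)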
Example #8
class LDAWDF:
    mysql: mysql.MySQL
    ldamodel: LdaModel
    dictionary = None
    corpus = None

    def __init__(self, mysql):
        self.mysql = mysql
        self.dataFolder = './data/'
        self.saveFile = 'lda_model'
        self.saveFileDict = 'lda_model_dict'

    def trainFromStart(self):
        with self.mysql as db:
            content = db.getContentsText()
        documents = []
        for item in content:
            documents.append(item['content'].split())

        self.dictionary = corpora.Dictionary(documents)

        self.dictionary.filter_extremes(no_below=5, no_above=0.5)

        doc_term_matrix = [self.dictionary.doc2bow(doc) for doc in documents]

        self.corpus = doc_term_matrix

        # Running and Training LDA model on the document term matrix.
        print("Starting to train LDA Model...")
        self.ldamodel = LdaModel(
            doc_term_matrix,
            num_topics=200,
            id2word=self.dictionary,
            passes=100)

    def printTest(self):
        print(self.ldamodel.print_topics(num_topics=10, num_words=5))

    def save(self):
        self.ldamodel.save(self.dataFolder + self.saveFile)
        self.dictionary.save(self.dataFolder + self.saveFileDict)

    def canLoad(self):
        my_file = Path(self.dataFolder + self.saveFile)
        my_file_dict = Path(self.dataFolder + self.saveFileDict)
        return my_file.is_file() and my_file_dict.is_file()

    def update(self, corpus):
        self.ldamodel.update(corpus)

    def load(self, subfolder=None):
        if subfolder:
            sf = subfolder + '/'
        else:
            sf = ''
        self.ldamodel = LdaModel.load(self.dataFolder + sf + self.saveFile)
        self.dictionary = gensim.corpora.Dictionary.load(self.dataFolder + sf + self.saveFileDict)

    def fillDb(self):
        topics = {}
        result = []
        result2 = []
        nbTopics = self.ldamodel.get_topics().shape[0]
        # "Old"
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 3)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            words = []
            for topicTerm in topicTerms:
                words.append(self.dictionary.get(topicTerm[0]))
            topics[topicId] = ' '.join(words)
        with self.mysql as db:
            contentsText = db.getContentsText()
            for element in contentsText:
                bow = self.dictionary.doc2bow(element['content'].split())
                docTopics = self.ldamodel.get_document_topics(bow, minimum_probability=0.05)
                if len(docTopics) > 0:
                    docTopics.sort(key=lambda x: x[1], reverse=True)
                    result.append((element['url'], topics[docTopics[0][0]]))
                    for docTopic in docTopics:
                        result2.append((element['url'], docTopic[0], str(docTopic[1])))
            db.emptyUrlsTopic()
            db.emptyCurrentUrlsTopic()
            db.emptyCurrentUserTags()
            db.setCurrentUrlsTopic(result2)
            db.setPrecalcTopics()
        # "New"
        terms = []
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 5)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            for topicTerm in topicTerms:
                terms.append((topicId, self.dictionary.get(topicTerm[0]), str(topicTerm[1])))
        with self.mysql as db:
            db.emptyLdaTopics()
            db.setLdaTopics(terms)


    def get_terms_topics(self, keywords):
        bow = self.dictionary.doc2bow(keywords[:30])
        topics = {}
        keywordsResult = {}
        for word in bow:
            wordTopics = self.ldamodel.get_term_topics(word[0], 0.05)
            keywordsResult[word[0]] = {'word': self.dictionary.get(word[0]), 'topics': wordTopics}
            for wordTopic in wordTopics:
                wordTopicId = wordTopic[0]
                if wordTopicId not in topics:
                    topics[wordTopicId] = self.ldamodel.show_topic(wordTopicId)
        return {'topics': topics, 'keywords': keywordsResult}
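Training needs a live MySQL wrapper, but a previously saved model can be inspected without one; a sketch, assuming files written earlier by save():

wdf = LDAWDF(mysql=None)
if wdf.canLoad():
    wdf.load()
    wdf.printTest()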
Example #9
class Lda:
    def __init__(self,
                 corpora=None,
                 savedModel=None,
                 numTopics=10,
                 seed=None,
                 autoAproach=False):
        '''
            corpora: Corpora, structured text data
            savedModel: str = None, path of a saved model to load
            numTopics: int = 10, number of topics to generate
            seed: int = None, use a specific random seed
            autoAproach = False, automatically adjust the number of topics
                to find a suitable value
        '''
        self.corpora = corpora
        self.numTopics = numTopics
        self.seed = seed

        if savedModel is None:
            self.__trainingModel()
        else:
            self.ldaModel = LdaModel.load(savedModel)

        if autoAproach:
            # Shrink the topic count while every document is still well
            # classified; grow it while it is not.
            wellLastTime = False
            while self.__isWellClassify() or not wellLastTime:
                if self.__isWellClassify():
                    wellLastTime = True
                    self.saveModel(name="temp")
                    self.numTopics -= 1
                    self.__trainingModel()
                elif not wellLastTime:
                    self.numTopics += 2
                    self.__trainingModel()
            # Roll back to the last model that classified well.
            self.numTopics += 1
            self.ldaModel = LdaModel.load("temp")

    def __trainingModel(self):
        if self.seed is not None:
            self.ldaModel = LdaModel(corpus=self.corpora.TfidfPair,
                                     id2word=self.corpora.Dictionary,
                                     num_topics=self.numTopics,
                                     random_state=np.random.RandomState(self.seed))
        else:
            self.ldaModel = LdaModel(corpus=self.corpora.TfidfPair,
                                     id2word=self.corpora.Dictionary,
                                     num_topics=self.numTopics)

    def __isWellClassify(self, threshold=0.8, test=None):
        '''
            Check that every document matches at least one topic with a
            probability above the threshold.
            threshold = 0.8: minimum acceptable topic probability
            (test: a dummy distribution for testing)
        '''
        if test is None:
            test = self.topicsDistribution()
        for tdb in test:
            ambiguous = True
            for prob in tdb:
                if prob[1] >= threshold:
                    ambiguous = False
                    break
            if ambiguous:
                return False
        return True

    def saveModel(self, name="my_model"):
        '''
            Save the trained model.
            name: str = "my_model", save path
        '''
        self.ldaModel.save(fname=name)

    def showTopicsStr(self, topn=10):
        '''
            Show the trained LDA topics as strings.
            topn: number of words to display
        '''
        return self.ldaModel.show_topics(num_topics=self.numTopics,
                                         num_words=topn)

    def showTopicsList(self, topn=10):
        '''
            Show the topics as lists of tuples.
            topn: number of words to display
        '''
        return self.ldaModel.show_topics(num_topics=self.numTopics,
                                         num_words=topn,
                                         formatted=False)

    def topicsDistribution(self, tfidf=None):
        '''
        Analyze structured documents with this model.
        Input:
            tfidf: 2d_list: TF-IDF matrix
        Output:
            2d_list: probability of each document belonging to each topic
        '''
        if tfidf is None:
            tfidf = self.corpora.TfidfPair
        return [self.ldaModel[article] for article in tfidf]

    def classifyTopic(self, topicsDistr=None):
        '''Return a list with the topic assigned to each document.'''
        if topicsDistr is None:
            topicsDistr = self.topicsDistribution()
        result = []
        for article in topicsDistr:  # for each document
            # Pick the topic with the highest probability; default to 0
            # for an empty distribution.
            topicID = max(article, key=lambda t: t[1])[0] if article else 0
            result.append(topicID)
        return result

    def findArticleMatched(self, classifiedTopic=None):
        '''Group documents by topic and return the groups as a list.'''
        if classifiedTopic is None:
            classifiedTopic = self.classifyTopic()
        numOfTopic = max(classifiedTopic) + 1
        result = [[] for num in range(0, numOfTopic)]
        for counter, topicId in enumerate(classifiedTopic):
            result[topicId].append(counter)  # drop each article into its topic bucket
        return result

    def __relativeEntropy(self, p, q):  # extra bits needed to encode p with q
        '''sum(p*log(p/q))'''
        if 0 in q:
            return math.inf  # infinity
        return reduce(operator.add, map(lambda x, y: x * math.log(x / y), p, q))

    def showRelativeEntropy(self, topicId, dtMatrix):
        '''Compute the relative entropy between a given document-term matrix and this model.'''
        klMeans = list()
        p = self.ldaModel.get_topics()[topicId]
        # q
        candidatesIds = self.findArticleMatched()[topicId]  # documents assigned to the topic
        for id in candidatesIds:
            dtm = dtMatrix[id]
            totalWordCount = sum(dtm)  # total word count, used to turn frequencies into probabilities
            q = list()
            for prob in dtm:
                if prob == 0:
                    q.append(1e-20)  # avoid log(0)
                else:
                    q.append(prob / totalWordCount)
            klMeans.append((id, self.__relativeEntropy(p, q)))
        return klMeans

    def showAuthenticArticle(self, topicId, num=1):
        '''Return the num most representative documents for a topic.'''
        entropy = self.showRelativeEntropy(topicId, self.corpora.DtMatrix)
        sortedEntropy = sorted(entropy, key=lambda x: x[1])
        return [t[0] for t in sortedEntropy[:num]]
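The Corpora object is external to this example; a minimal stand-in with the three attributes the class actually touches (Dictionary, TfidfPair, DtMatrix) might look like this (illustrative only, not the original API):

from dataclasses import dataclass
from gensim.corpora import Dictionary as GensimDictionary
from gensim.models import TfidfModel

@dataclass
class Corpora:
    Dictionary: object
    TfidfPair: list
    DtMatrix: list

texts = [['cat', 'dog'], ['dog', 'fish'], ['stock', 'market', 'trade']]
d = GensimDictionary(texts)
bow = [d.doc2bow(t) for t in texts]
tfidf = TfidfModel(bow)
corpora = Corpora(Dictionary=d,
                  TfidfPair=[tfidf[b] for b in bow],
                  DtMatrix=[[dict(b).get(i, 0) for i in range(len(d))] for b in bow])

lda = Lda(corpora=corpora, numTopics=2, seed=1)
print(lda.showTopicsStr(topn=3))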