Example #1
def exec_lda():
    client = MongoClient()
    db = client.epistemonikos_files

    num_topics = range(2, 51)
    files = range(12)  # number of possible preprocessing combinations

    # ALL documents
    for f in files:
        for t in num_topics:
            data_cursor = DBDataCursor(db, 'preprocess_{0}'.format(f))
            corpus_cursor = CorpusCursor(data_cursor)
            lda = LdaMulticore(corpus=corpus_cursor,
                               id2word=corpus_cursor.dictionary, num_topics=t)
            lda.save(
                'processing_data/lda/all_docs/preprocess_{0}_topics_{1}'.format(
                    f, t))

    # For each document TYPE
    config_list = utils.create_config_list(
        'processing_data/lda/types/config_list.json')

    for i, config in enumerate(config_list):
        for f in files:
            for t in num_topics:
                data_cursor = DBDataCursor(db, 'preprocess_{0}'.format(f),
                                           **config)
                corpus_cursor = CorpusCursor(data_cursor, config=i)
                lda = LdaMulticore(corpus=corpus_cursor,
                                   id2word=corpus_cursor.dictionary,
                                   num_topics=t)
                lda.save(
                    'processing_data/lda/types/preprocess_{0}-topics_{1}-config_{2}'.format(
                        f, t, i))
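The saved models can be reloaded later for inspection; a minimal sketch, assuming one of the paths written above (the topic count is illustrative):

# Sketch: reload a saved model and print a few of its topics.
from gensim.models import LdaModel

lda = LdaModel.load('processing_data/lda/all_docs/preprocess_0_topics_2')
for topic_id, words in lda.print_topics(num_topics=5):
    print(topic_id, words)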
Example #2
    def main(self):
        print('Loading data')
        data = pd.read_csv('../../resources/abcnews-date-text.csv', error_bad_lines=False)
        data_text = data[['headline_text']].copy()  # copy to avoid SettingWithCopyWarning
        data_text['index'] = data_text.index
        documents = data_text

        np.random.seed(2018)

        print('Preprocessing text')
        preprocessed_docs = documents['headline_text'].map(self.preprocess)

        print('Building bag of words corpus')
        dictionary = Dictionary(preprocessed_docs)   # list: token_id, token
        dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
        bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]     # list: token_id, token_count

        print(documents[documents['index'] == 4310].values[0][0])
        print(bow_corpus[4310])
        print(bow_corpus[:100])

        print('Building lda model from bag of words')
        lda_model_bow = LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, workers=self.workers)
        for idx, topic in lda_model_bow.print_topics(-1):
            print('Topic: {} \nWords: {}'.format(idx, topic))

        for index, score in sorted(lda_model_bow[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
            print("\nScore: {}\t \nTopic: {}".format(score, lda_model_bow.print_topic(index, 10)))

        print('Building tfidf corpus from bag of words corpus')
        tfidf = TfidfModel(bow_corpus)
        tfidf_corpus = tfidf[bow_corpus]
        from pprint import pprint
        for doc in tfidf_corpus:
            pprint(doc)
            break

        print('Building lda model from tfidf')
        lda_model_tfidf = LdaMulticore(tfidf_corpus, num_topics=10, id2word=dictionary, workers=self.workers)
        for idx, topic in lda_model_tfidf.print_topics(-1):
            print('Topic: {} \nWords: {}'.format(idx, topic))

        for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
            print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

        print('Testing on unseen document')
        unseen_document = 'Facebook’s global lobbying against data privacy laws'
        bow_vector = dictionary.doc2bow(self.preprocess(unseen_document))

        print('Bow:')
        for index, score in sorted(lda_model_bow[bow_vector], key=lambda tup: -1*tup[1]):
            print("Score: {}\t Topic: {}".format(score, lda_model_bow.print_topic(index, 5)))

        print('TfIdf:')
        for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
            print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))
Example #3
 def load_topic_model(self):
     if not hasattr(self, "word2id"):
         self.load_globel_vocab()
     self.vectorizer = CountVectorizer(vocabulary=self.word2id,
                                       tokenizer=lambda x: x,
                                       preprocessor=lambda x: x)
     file_path = "./preproc_data/topic_model.pkl"
     if os.path.exists(file_path):
         self.topic_model = LdaModel.load(file_path)
     else:
         texts = []
         if not hasattr(self, "domain2data"):
             self.load_domain2data()
         for domain in self.domain2data:
             texts.extend(self.domain2data[domain]["labeled"])
             texts.extend(self.domain2data[domain]["unlabeled"])
         corpus = self.vectorizer.fit_transform(texts)
         corpus = Sparse2Corpus(corpus, documents_columns=False)
         self.topic_model = LdaMulticore(
             corpus=corpus,
             num_topics=self.num_topics,
             id2word=self.id2word,
             iterations=self.num_topic_iterations,
             passes=self.num_topic_passes)
         self.topic_model.save(file_path)
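New text can be pushed through the same vectorizer for inference; a hypothetical method sketch, assuming documents are pre-tokenized lists as above:

# Sketch (hypothetical helper): topic mixture for one tokenized document.
from gensim.matutils import Sparse2Corpus

def topic_vector(self, tokens):
    sparse = self.vectorizer.transform([tokens])
    bow = next(iter(Sparse2Corpus(sparse, documents_columns=False)))
    return self.topic_model[bow]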
Example #4
def arun(corpus, dictionary, min_topics=10, max_topics=100, step=10):
    l = np.array([sum(cnt for _, cnt in doc) for doc in corpus])
    
    kl = []
    for n in range(min_topics, max_topics+step, step):
        print("starting multicore LDA for num_topics={}".format(n))
        st = time.perf_counter()
        lda = LdaMulticore(corpus=corpus,
                           id2word=dictionary,
                           num_topics=n,
                           passes=20,
                           workers=mp.cpu_count()-1)
        el = time.perf_counter()-st
        print("multicore LDA finished in {:.2f}s!".format(el))
        
        m1 = lda.expElogbeta
        _, cm1, _ = np.linalg.svd(m1)
        
        lda_topics = lda[corpus]
        m2 = matutils.corpus2dense(lda_topics, lda.num_topics).transpose()
        cm2 = l.dot(m2)
        cm2 = cm2 + 0.0001
        cm2norm = np.linalg.norm(l)
        cm2 = cm2/cm2norm
        kl.append(sym_kl(cm1, cm2))
        
    return kl
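sym_kl is not defined in this excerpt; the Arun et al. measure compares the two quantities with a symmetric KL divergence, which could be sketched as:

# Sketch: symmetric KL divergence, as assumed by arun() above.
# scipy's entropy(p, q) computes KL(p || q) and normalizes its inputs.
from scipy.stats import entropy

def sym_kl(p, q):
    return 0.5 * (entropy(p, q) + entropy(q, p))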
Example #5
def multicore():
    # set of terms with the largest tf-idf weights
    words = pickle.loads(open('tmp/words.pkl', 'rb').read())

    documents = []
    for name in glob.glob('tmp/wakati/*'):
        terms = open(name).read().split()
        terms = [term for term in terms if term in words]
        documents.append(terms)

    term_index = {}
    for terms in documents:
        for term in terms:
            # skip terms that are not in the word set
            if term not in words:
                continue
            if term_index.get(term) is None:
                term_index[term] = len(term_index)

    open('tmp/pickles/term_index.pkl', 'wb').write(pickle.dumps(term_index))
    gensim_corpus = []

    for document in documents:
        tf = dict(Counter(document))
        doc = [(term_index[t], f) for t, f in tf.items()]
        gensim_corpus.append(doc)
    print('starting to fit multicore LDA...')
    model = LdaMulticore(gensim_corpus, workers=8, num_topics=TOPICN)
    open('tmp/pickles/model.pkl', 'wb').write(pickle.dumps(model))

    print('finished learning')
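Because the model is trained without id2word, its topics come back as integer ids; a sketch for mapping them back through the pickled term_index:

# Sketch: print top terms per topic by inverting term_index.
term_index = pickle.loads(open('tmp/pickles/term_index.pkl', 'rb').read())
model = pickle.loads(open('tmp/pickles/model.pkl', 'rb').read())
id2term = {i: t for t, i in term_index.items()}
for topic_id in range(model.num_topics):
    top = model.get_topic_terms(topic_id, topn=10)  # (word_id, probability) pairs
    print(topic_id, ' '.join(id2term[wid] for wid, _ in top))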
Example #6
def train(corpus, dct, docs, ids, num_topics, field):
    model_dir = f'./models/{field}/k_{num_topics}/'
    os.makedirs(model_dir, exist_ok=True)

    model_file = model_dir + 'model'
    # The filename is the file that will be created with the log.
    # If the file already exists, the log will continue rather than being overwritten.
    log_file = model_dir + 'model_callbacks.log'
    logging.basicConfig(filename=log_file,
                        format="%(asctime)s:%(levelname)s:%(message)s",
                        level=logging.NOTSET)

    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dct,
        random_state=2020,
        num_topics=num_topics,
        # passes=100,
        chunksize=5000,
        # batch=False,
        alpha='asymmetric',
        decay=0.5,
        offset=64,
        eta='auto',
        eval_every=0,
        iterations=10,
        # gamma_threshold=0.001,
        per_word_topics=True)

    lda_model.save(model_file)

    return lda_model
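Since the model is trained with per_word_topics=True, inference can also return word-level topic assignments; a sketch, assuming a bag-of-words document from the training corpus:

# Sketch: per-word topic assignments for one document.
doc_topics, word_topics, word_phis = lda_model.get_document_topics(
    corpus[0], per_word_topics=True)
print(doc_topics)   # (topic_id, probability) pairs for the document
print(word_topics)  # (word_id, [topic ids ordered by relevance]) pairs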
Example #7
    def __createbasemodel(self):
        print('Creating base model')
        # Topics: 6, Alpha: asymmetric, Beta (eta): symmetric, Coherence: 0.723863804

        self.__model = LdaMulticore(corpus=self.corpus_tfidf,
                                    id2word=self.id2word,
                                    num_topics=6,
                                    alpha='asymmetric',
                                    eta='symmetric',
                                    workers=2,
                                    random_state=100,
                                    chunksize=100,
                                    passes=10,
                                    per_word_topics=True)
        if self.__config['Storemodel']:
            self.__savemodel()
        print(self.__model.print_topics())
        print(self.__model[self.gensim_bow])
        print('calculating coherence')
        #__cohe_model = CoherenceModel(model=self.__model,texts=self.processeddata,dictionary=self.id2word,coherence='c_v')
        __cohe_model = CoherenceModel(model=self.__model,
                                      corpus=self.corpus_tfidf,
                                      coherence='u_mass')
        __cohe = __cohe_model.get_coherence()
        print('coherence :', __cohe)

        self._addMLflowMetric('BaseModel.Coherence', __cohe)
        return self.__model
Example #8
    def build_models(self):
        documents_tokenized = []
        for doc in self.__document_list:
            processed_document = self.__preprocess_text_document(doc)
            if len(processed_document) > 0:
                documents_tokenized.append(processed_document)

        # if the documents get filtered out completely (by the intersection with the index),
        # add some random word to prevent exceptions
        if len(documents_tokenized) <= 0:
            documents_tokenized.append(['None'])

        # turn tokenized documents into a id <-> term dictionary
        self.__dictionary = Dictionary(documents_tokenized)

        # convert tokenized documents into a document-term matrix
        self.__corpus = [
            self.__dictionary.doc2bow(document)
            for document in documents_tokenized
        ]

        # generate models
        self.__model_tfidf = TfidfModel(corpus=self.__corpus)
        self.__model_lsi = LsiModel(corpus=self.__corpus,
                                    num_topics=self.topics_number)
        self.__model_lda = LdaMulticore(corpus=self.__corpus,
                                        num_topics=self.topics_number,
                                        id2word=self.__dictionary,
                                        workers=cpu_count() - 1,
                                        chunksize=2000,
                                        passes=1,
                                        batch=False)
Example #9
def train_lda():
	"""
	Usage: python Wechat_LDA.py wechat.csv
	"""
	with open(sys.argv[1], 'r') as wx:
		for f in wx:
			seg = jieba.cut(f)
			seg = [word for word in seg if word not in stopwords]
			with codecs.open('wechat_seg.txt', encoding='utf-8', mode='ab') as wx_seg:
				wx_seg.write(' '.join(seg) + '\n')  # newline keeps one document per line

	# pass the path rather than an open handle so LineSentence can be iterated twice
	documents = 'wechat_seg.txt'
	dictionary = corpora.Dictionary(LineSentence(documents))
	corpus = [dictionary.doc2bow(text) for text in LineSentence(documents)]
	tfidf_model = TfidfModel(corpus, id2word=dictionary, normalize=True)
	tfidf_model.save('wechat_seg.txt.tfidf_model')
	# corpora.MmCorpus.serialize('wechat_seg.txt.tfidf_model.mm', tfidf_model[corpus])
	lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=cpu_count()-1)
	lda_model.save('wechat_lda_model.pkl')

	topics = []
	for doc in corpus:
		topics.append(lda_model[doc])

	counts = np.zeros(100)
	for top_doc in topics:
		for ti, _ in top_doc:
			counts[ti] += 1

	words = lda_model.show_topic(counts.argmax(), 64)
	with open('top_words.txt', 'w') as tw:
		writer = UnicodeWriter(tw)
		for w in words:
			writer.writerow((w[0], int(float(w[1])*1000)))
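UnicodeWriter is not defined in this excerpt (it is a common Python 2 CSV helper); under Python 3 a thin wrapper around csv.writer would do:

# Sketch: minimal Python 3 stand-in for the UnicodeWriter used above.
import csv

class UnicodeWriter:
    def __init__(self, f):
        self._writer = csv.writer(f)

    def writerow(self, row):
        self._writer.writerow([str(col) for col in row])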
Example #10
    def get_coherence_values(self):
        """
        Compute c_v coherence for various number of topics

        Parameters:
        ----------
        dictionary : Gensim dictionary
        corpus : Gensim corpus
        texts : List of input texts
        limit : Max num of topics

        Returns:
        -------
        model_list : List of LDA topic models
        coherence_values : Coherence values corresponding to the LDA model with respective number of topics
        """
        for i in range(*self.json['num_topics']):  # i = topic number
            for j in range(*self.json['num_passes']):  # j = pass number
                for corpus_type in self.corpuses['corpus_type']:
                    sys.stdout.write(
                        '\r Building model: topic # {} - pass # {} - {} corpus'
                        .format(i, j, corpus_type))
                    model = LdaMulticore(corpus=corpus_type,
                                         id2word=self.dictionary,
                                         num_topics=i,
                                         passes=j)
                    self.models['model'][(i, j)] = model
                    self.models['c_v'][(i, j)] = CoherenceModel(
                        model=model,
                        texts=corpus_type,
                        dictionary=self.dictionary,
                        coherence='c_v').get_coherence()
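With scalar coherence scores stored, the best (num_topics, passes) pair can be read off directly; a sketch, assuming the dictionaries populated above and run inside the same class:

# Sketch: select the configuration with the highest c_v coherence.
best_key = max(self.models['c_v'], key=self.models['c_v'].get)
best_model = self.models['model'][best_key]
print('best (num_topics, passes):', best_key)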
Example #11
    def load_model(self, phrase):
        processed_phrase = self.preprocessing(phrase)

        self.all_phrases.append(processed_phrase)
        # print(self.all_phrases)
        # dct = Dictionary(common_texts)
        dct = Dictionary(self.all_phrases)
        corpus = [dct.doc2bow(line) for line in self.all_phrases]
        lda_model = LdaMulticore(corpus=corpus,
                                 id2word=dct,
                                 random_state=100,
                                 num_topics=3,
                                 passes=10,
                                 chunksize=1000,
                                 batch=False,
                                 alpha="asymmetric",
                                 decay=0.5,
                                 offset=64,
                                 eta=None,
                                 eval_every=0,
                                 iterations=100,
                                 gamma_threshold=0.001,
                                 per_word_topics=True)

        topic_keywords = []
        topics = lda_model.print_topics(-1)

        for topic in topics[:3]:
            topics_str = topic[1]
            pattern = r"[^a-zA-Z+]"
            topics_list = re.sub(pattern, "", topics_str).split("+")
            topic_keywords += topics_list[:5]

        return topic_keywords
Example #12
    def train(self, num_topics, chunksize=10000, passes=6, iterations=40, eval_every=40):
      fmodel = f"./models/lda_{num_topics}top_{iterations}iter_{passes}pass"
#       logging.basicConfig(filename=fmodel + ".log",
#                     format="%(asctime)s:%(levelname)s:%(message)s",
#                     level=logging.INFO)
      
      temp = self.dictionary[0]  # access one entry so the dictionary materializes id2token
      id2word = self.dictionary.id2token
      model = LdaMulticore( corpus=self.corpus,
                            id2word=id2word,
                            chunksize=chunksize,
                            iterations=iterations,
                            num_topics=num_topics,
                            passes=passes,
                            eval_every=eval_every)
      model.save(fmodel + ".pt")
      self.model = model

#       p = re.compile("(-*\d+\.\d+) per-word .* (\d+\.\d+) perplexity")
#       matches = [p.findall(l) for l in open(fmodel+'.log')]
#       matches = [m for m in matches if len(m) > 0]
#       tuples = [t[0] for t in matches]
#       perplexity = [float(t[1]) for t in tuples]
#       likelihood = [float(t[0]) for t in tuples]
#       iter = list(range(0,len(tuples)*10,10))
#       plt.plot(iter,likelihood,c="black")
#       plt.ylabel("log likelihood")
#       plt.xlabel("iteration")
#       plt.title("Topic Model Convergence")
#       plt.grid()
#       plt.savefig(fmodel + ".pdf") 
      
      return model
Example #13
def updateLDA():
    api_file="./newsapi.key"
    categories=['business', 'entertainment', 'general', 'health', 'science', 'sports', 'technology']
    
    with open(api_file,"r") as apikey:
        newsapi=NewsApiClient(api_key=apikey.read().strip())
    
    headlines={cat:newsapi.get_top_headlines(category=cat, language='en', country='in') for cat in categories}
    pp_docs=[]
    
    for category in headlines:
        for article in headlines[category]['articles']:
            #print(lemma_pp(article['title']))
            pp_docs.append(lemma_pp(article['title']))
            
            
    if os.path.exists(MODEL_DIR+"corpus_dict.model"):
        corp_d=Dictionary.load(MODEL_DIR+"corpus_dict.model")
        corp_d.add_documents(pp_docs)
    else:
        corp_d = Dictionary(pp_docs)
        corp_d.filter_extremes(no_below=2, no_above=0.5)
    
    
    dtm=[corp_d.doc2bow(doc) for doc in pp_docs]
    
    tfidf=TfidfModel(dtm)
    corp_tfidf=tfidf[dtm]
    
    lda = LdaMulticore(corp_tfidf, num_topics=5, id2word=corp_d, passes=60, workers=3)
    print(lda.print_topics(num_topics=5, num_words=5))
    checkdir(MODEL_DIR)
    corp_d.save(MODEL_DIR+"corpus_dict.model")
    #corp_tfidf.save(MODEL_DIR+"corpus_tfidf.model")
    lda.save(MODEL_DIR+"lda.model")
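The refreshed model can then score a fresh headline with the same preprocessing; a sketch, reusing lemma_pp and MODEL_DIR from above (the headline is illustrative):

# Sketch: topic mixture for one unseen headline.
corp_d = Dictionary.load(MODEL_DIR + "corpus_dict.model")
lda = LdaMulticore.load(MODEL_DIR + "lda.model")
bow = corp_d.doc2bow(lemma_pp('A new headline about cricket'))
print(lda[bow])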
Example #14
    def createbasemodel(self):
        print('Creating base model')
        # Topics: 6, Alpha: asymmetric, Beta (eta): symmetric, Coherence: 0.723863804

        self.__model = LdaMulticore(corpus=self.__data.corpus_tfidf,
                                    id2word=self.__data.id2word,
                                    num_topics=6,
                                    alpha='asymmetric',
                                    eta='symmetric',
                                    workers=2,
                                    random_state=100,
                                    chunksize=100,
                                    passes=10,
                                    per_word_topics=True)
        print(self.__model.print_topics())
        print(self.__model[self.__data.gensim_bow])
        print('calculating coherence')
        __cohe_model = CoherenceModel(model=self.__model,
                                      texts=self.__data.processeddata,
                                      dictionary=self.__data.id2word,
                                      coherence='c_v')
        __cohe = __cohe_model.get_coherence()
        print('coherence :', __cohe)
        #print('hyper param tuning')
        #self.__hyperparamtunning()
        print('saving model')
        self.__savemodel()
Example #15
 def train_tfidf(self,num_topics=12):
     dictionary = corpora.Dictionary(self.df)
     corpus = [dictionary.doc2bow(doc) for doc in self.df]
     tfidf = TfidfModel(corpus)
     corpus_tfidf = tfidf[corpus]
     lda_model = LdaMulticore(corpus_tfidf, num_topics=num_topics, id2word=dictionary, passes=2, workers=2)
     return dictionary, corpus_tfidf, lda_model, tfidf
Example #16
def compute_coherence_lda(corpus,
                          dictionary,
                          start=None,
                          limit=None,
                          step=None):
    """Compute c_v coherence for various number of topics """
    topic_coherence = []
    model_list = []
    tokens_list = df.trigram_tokens.values.tolist()
    texts = [[token for sub_token in tokens_list for token in sub_token]]
    for num_topics in range(start, limit, step):
        model = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            eta='auto',
            workers=4,
            passes=20,
            iterations=100,
            random_state=42,
            eval_every=None,
            alpha='asymmetric',  # shown to be better than symmetric in most cases
            decay=0.5,
            offset=64  # best params from Hoffman paper
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence='c_v')
        topic_coherence.append(coherencemodel.get_coherence())

    return model_list, topic_coherence
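The two returned lists line up with the topic-count range, so the best model can be picked by coherence; for example, with illustrative bounds:

# Sketch: pick the topic count with the highest c_v coherence.
models, scores = compute_coherence_lda(corpus, dictionary, start=5, limit=40, step=5)
best = max(range(len(scores)), key=scores.__getitem__)
print('best num_topics:', range(5, 40, 5)[best])
best_model = models[best]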
Example #17
    def train_lda(self, path):
        """
        https://www.cnblogs.com/Luv-GEM/p/10881838.html
        gensim 所需要的输入格式为:['教授', '长江', '学者', '优秀成果', '集中', '呈现'],也就是每段文章 text 是一个列表,元素为词语。
        然后构建语料库,再利用语料库把每篇新闻进行数字化,corpus 就是数字化后的结果。
        第一段文章 text ID 化后的结果为 corpus[0]:[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), ...],每个元素是 text 中的每个词语的 ID 和频率。
        最后训练 LDA 模型。LDA是一种无监督学习方法,我们可以自由选择主题的个数。num_topics = 30
        """

        print('train lda')
        corpus_data = get_corpus(path, w2v=True)
        id2word = gensim.corpora.Dictionary(corpus_data)
        corpus = [id2word.doc2bow(text) for text in corpus_data]

        # corpus[0]: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), ...]
        # corpus holds each article after ID conversion; each element is a
        # token's dictionary ID and its frequency within the article.

        LDAmodel = LdaMulticore(corpus=corpus,
                                id2word=id2word,
                                num_topics=30,
                                workers=4,
                                chunksize=4000,
                                passes=7,
                                alpha='asymmetric')
        return LDAmodel
Example #18
def lda_matrix(matrix_id, preprocess, topics, data_path):

    data_cursor, corpus_cursor = get_file_cursor(matrix_id, preprocess, data_path)
    lda = LdaMulticore(corpus=corpus_cursor, id2word=corpus_cursor.dictionary,
                       num_topics=topics)
    lda.save(os.path.join(data_path, '{2}-preprocess_{0}-topics_{1}.lda'.format(
                preprocess, topics, matrix_id)))
    return lda, corpus_cursor, data_cursor
Example #19
 def __buildLDA(self, num_topics, chunksize, passes):
     self.__model = LdaMulticore(self.__corpus,
                                 id2word=self.__corpus.getDictionary(),
                                 num_topics=num_topics,
                                 chunksize=chunksize,
                                 passes=passes,
                                 eval_every=None,
                                 workers=40,
                                 random_state=10)