Example #1
    def createLDA(self, fileName = '', modelName= '', ldaPasses='', topicNum=''):
        '''
        fileName -> file for the dictionary (.dict) and corpus (.mm) files 
        modelName -> model name for LDA to save to disk
        ldaPasses -> number of passes, 10 by default
        topicNum -> number of topics to generate, 100 by default
        '''
        if fileName == '':
            fileName = self.__fileName
            
        if ldaPasses == '':
            ldaPasses = self.__ldaPasses
    
        if topicNum == '':
            topicNum = self.__topicNum

        if modelName == '':
            modelName = fileName + '_' + str(ldaPasses) + 'P_' + str(topicNum) + 'T'
        
        dict = corpora.Dictionary.load(self.__destination+fileName+'.dict')
        mm = corpora.MmCorpus(self.__destination+fileName+'.mm')
        
        #lda = models.ldamodel.LdaModel(corpus=mm, id2word=dict, num_topics=6, update_every=1, chunksize=10000, passes=10)
        lda = LdaMulticore(corpus=mm, num_topics=topicNum, id2word=dict, chunksize=30000, passes=ldaPasses, workers=3)
        lda.save(self.__destination+modelName+'.lda')
        #=======================================================================
        # print lda
        #=======================================================================
        print('Created LDA model %s' % self.__fileName)
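A minimal sketch (paths and the topic/pass counts are assumptions) of how a model saved by createLDA above could be loaded back and queried:

from gensim import corpora
from gensim.models import LdaMulticore

# Hypothetical paths following the destination + fileName / modelName pattern above.
dictionary = corpora.Dictionary.load('/data/out/myCorpus.dict')
lda = LdaMulticore.load('/data/out/myCorpus_10P_100T.lda')

# Top words of the first few topics.
for topic_id, words in lda.print_topics(num_topics=5, num_words=8):
    print(topic_id, words)

# Topic mixture of a new, already-tokenized document.
bow = dictionary.doc2bow(['topic', 'modeling', 'with', 'gensim'])
print(lda.get_document_topics(bow))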
Example #2
    def train(self):
        tfidf = TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = LdaMulticore(corpus=corpus_tfidf,
                           id2word=dictionary,
                           num_topics=100)
        lda.save('lda.model')
Example #3
    def getLDA(self):
        logging.info("Creating bag of words for LDA model")
        self.dictionary = gensim.corpora.Dictionary(self.corpus)
        self.dictionary.filter_extremes(no_below=2, no_above=0.1)
        self.bow_corpus = [self.dictionary.doc2bow(doc) for doc in self.corpus]
        del self.corpus
        lda_models_coherence_cV = []

        for num_topics in tqdm(range(3, 13)):
            model_lda = LdaMulticore(corpus = self.bow_corpus,
                                 num_topics=num_topics,
                                 id2word=self.dictionary,
                                 workers=8)

            #coherencemodel = CoherenceModel(model=model_lda,
            #                                texts=self.corpus,
            #                                dictionary=self.dictionary,
            #                                coherence='c_v')

            #coherence_value = coherencemodel.get_coherence()
            #lda_models_coherence_cV.append(coherence_value)
            if "topic" not in os.listdir(self.path):
                os.mkdir(self.path+"/topic")
            model_lda.save(self.path+"/topic/lda_"+str(num_topics)+".model")
            #coherencemodel.save(self.path+"/topic/coherence_" + str(num_topics) + ".model")
            self.lda_models[num_topics] = model_lda

        return
Example #4
def train(corpus, dct, docs, ids, num_topics, field):
    model_dir = f'./models/{field}/k_{num_topics}/'
    os.makedirs(model_dir, exist_ok=True)

    model_file = model_dir + 'model'
    # The filename is the file that will be created with the log.
    # If the file already exists, the log will continue rather than being overwritten.
    log_file = model_dir + 'model_callbacks.log'
    logging.basicConfig(filename=log_file,
                        format="%(asctime)s:%(levelname)s:%(message)s",
                        level=logging.NOTSET)

    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dct,
        random_state=2020,
        num_topics=num_topics,
        #                          passes=100,
        chunksize=5000,
        #                          batch=False,
        alpha='asymmetric',
        decay=0.5,
        offset=64,
        eta='auto',
        eval_every=0,
        iterations=10,
        #                            gamma_threshold=0.001,
        per_word_topics=True)

    lda_model.save(model_file)

    return lda_model
Example #5
def lda_train(train_data, part, save_root):
    ids = list(train_data['id'])
    texts = list(train_data[part])

    with Pool() as pool:
        texts = list(
            tqdm.tqdm(pool.imap(tokenize, texts), total=len(texts), ncols=100))

    text_dictionary = Dictionary(texts)
    text_dictionary.save(os.path.join(save_root, 'dict'))

    with Pool(initializer=make_dictionary_global,
              initargs=(text_dictionary, )) as pool:
        texts = list(
            tqdm.tqdm(pool.imap(doc2bow_unit, texts),
                      total=len(texts),
                      ncols=100))

    lda_model = LdaMulticore(texts, workers=7)
    lda_model.save(os.path.join(save_root, 'model'))

    with Pool(initializer=make_model_global, initargs=(lda_model, )) as pool:
        rows = list(
            tqdm.tqdm(pool.imap(get_document_topics_unit, texts),
                      total=len(texts),
                      ncols=100))
    topics = pd.DataFrame(rows, columns=['topics', 'topic_num'])
    topics.insert(0, 'id', ids)
    topics.to_csv(os.path.join(save_root, 'train.csv'), index=False)

    return text_dictionary, lda_model
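The pool helpers used above (make_dictionary_global, doc2bow_unit, make_model_global, get_document_topics_unit) are defined elsewhere in that project; a minimal sketch of what they might look like, with the names and the (topics, topic_num) result shape as assumptions:

# Module-level globals shared with the worker processes.
_dictionary = None
_lda_model = None

def make_dictionary_global(dictionary):
    global _dictionary
    _dictionary = dictionary

def doc2bow_unit(tokens):
    # Convert one tokenized document into its bag-of-words representation.
    return _dictionary.doc2bow(tokens)

def make_model_global(model):
    global _lda_model
    _lda_model = model

def get_document_topics_unit(bow):
    # Return the full topic distribution and the id of the dominant topic.
    topics = _lda_model.get_document_topics(bow)
    topic_num = max(topics, key=lambda t: t[1])[0] if topics else -1
    return topics, topic_num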
Example #6
def train_lda():
	"""
	Usage: python Wechat_LDA.py wechat.csv
	"""
	with open(sys.argv[1], 'r') as wx:
		for f in wx:
			seg = jieba.cut(f)
			seg = [word for word in seg if word not in stopwords]
			with codecs.open('wechat_seg.txt', encoding='utf-8', mode='ab') as wx_seg:
				wx_seg.write(' '.join(seg))

	documents = open('wechat_seg.txt', 'r')
	dictionary = corpora.Dictionary(LineSentence(documents))
	corpus = [dictionary.doc2bow(text) for text in LineSentence(documents)]
	tfidf_model = TfidfModel(corpus, id2word=dictionary, normalize=True)
	tfidf_model.save('wechat_seg.txt.tfidf_model')
	# corpora.MmCorpus.serialize('wechat_seg.txt.tfidf_model.mm', tfidf_model[corpus])
	lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=cpu_count()-1)
	lda_model.save('wechat_lda_model.pkl')

	topics = []
	for doc in corpus:
		topics.append(lda_model[doc])

	counts = np.zeros(100)
	for top_doc in topics:
		for ti, _ in top_doc:
			counts[ti] += 1

	words = lda_model.show_topic(counts.argmax(), 64)
	with open('top_words.txt', 'w') as tw:
		writer = UnicodeWriter(tw)
		for w in words:
			writer.writerow((w[0], int(float(w[1])*1000)))
Example #7
def train(file=DATA_FILE, type=JSON):
    delete_previous_models()

    faq_df = get_dataframe(os.path.join(DATA_DIR, file), type=type)
    faq_df = clean_data(faq_df)
    faq_df[PROCESSED_QUESTION] = faq_df[CLEAN_QUESTION].apply(preprocess)
    faq_df[PROCESSED_ANSWER] = faq_df[CLEAN_ANSWER].apply(preprocess)
    print('Preprocessing Done')
    if DEBUG:
        print(faq_df.head())

    for mode in modes:
        model = modes[mode]
        dictionary = corpora.Dictionary(faq_df[model.column])
        dictionary.save(os.path.join(MODEL_DIR, model.dictionary))
        corpus = faq_df[model.column].map(dictionary.doc2bow)
        if DEBUG:
            print(f'{model.corpus} generated')
            print(corpus.head())
        corpora.MmCorpus.serialize(os.path.join(MODEL_DIR, model.corpus),
                                   corpus)
        tfidf_model = TfidfModel(corpus)
        if DEBUG:
            print(f'{model.tfidf} generated')
        tfidf_model.save(os.path.join(MODEL_DIR, model.tfidf))
        tfidf = tfidf_model[corpus]
        lda_model = LdaMulticore(corpus=tfidf,
                                 id2word=dictionary,
                                 num_topics=30)
        lda_model.save(os.path.join(MODEL_DIR, model.model))
        if DEBUG:
            print(f'{model.model} generated')
            print(lda_model.print_topics(5))
    print('Training completed')
Example #8
    def train(self, num_topics, chunksize=10000, passes=6, iterations=40, eval_every=40):
      fmodel = f"./models/lda_{num_topics}top_{iterations}iter_{passes}pass"
#       logging.basicConfig(filename=fmodel + ".log",
#                     format="%(asctime)s:%(levelname)s:%(message)s",
#                     level=logging.INFO)
      
      temp = self.dictionary[0]  # access one entry so that dictionary.id2token gets populated
      id2word = self.dictionary.id2token
      model = LdaMulticore( corpus=self.corpus,
                            id2word=id2word,
                            chunksize=chunksize,
                            iterations=iterations,
                            num_topics=num_topics,
                            passes=passes,
                            eval_every=eval_every)
      model.save(fmodel + ".pt")
      self.model = model

#       p = re.compile("(-*\d+\.\d+) per-word .* (\d+\.\d+) perplexity")
#       matches = [p.findall(l) for l in open(fmodel+'.log')]
#       matches = [m for m in matches if len(m) > 0]
#       tuples = [t[0] for t in matches]
#       perplexity = [float(t[1]) for t in tuples]
#       liklihood = [float(t[0]) for t in tuples]
#       iter = list(range(0,len(tuples)*10,10))
#       plt.plot(iter,liklihood,c="black")
#       plt.ylabel("log liklihood")
#       plt.xlabel("iteration")
#       plt.title("Topic Model Convergence")
#       plt.grid()
#       plt.savefig(fmodel + ".pdf") 
      
      return model
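The commented-out block above parses gensim's training log to plot convergence; a runnable sketch of the same idea (the log-line format and the 10-document logging interval are taken from that commented code and are assumptions):

import re
import matplotlib.pyplot as plt

def plot_convergence(log_path, fig_path):
    # gensim logs lines such as "-6.42 per-word bound, 86.0 perplexity estimate ..."
    pattern = re.compile(r"(-*\d+\.\d+) per-word .* (\d+\.\d+) perplexity")
    with open(log_path) as log_file:
        matches = [pattern.findall(line) for line in log_file]
    tuples = [m[0] for m in matches if m]
    likelihood = [float(t[0]) for t in tuples]
    iterations = list(range(0, len(tuples) * 10, 10))
    plt.plot(iterations, likelihood, c="black")
    plt.ylabel("log likelihood")
    plt.xlabel("iteration")
    plt.title("Topic Model Convergence")
    plt.grid()
    plt.savefig(fig_path)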
Example #9
def updateLDA():
    api_file="./newsapi.key"
    categories=['business', 'entertainment', 'general', 'health', 'science', 'sports', 'technology']
    
    with open(api_file,"r") as apikey:
        newsapi=NewsApiClient(api_key=apikey.read().strip())
    
    headlines={cat:newsapi.get_top_headlines(category=cat, language='en', country='in') for cat in categories}
    pp_docs=[]
    
    for category in headlines:
        for article in headlines[category]['articles']:
            #print(lemma_pp(article['title']))
            pp_docs.append(lemma_pp(article['title']))
            
            
    if os.path.exists(MODEL_DIR+"corpus_dict.model"):
        corp_d=Dictionary.load(MODEL_DIR+"corpus_dict.model")
        corp_d.add_documents(pp_docs)
    else:
        corp_d = Dictionary(pp_docs)
        corp_d.filter_extremes(no_below=2, no_above=0.5)
    
    
    dtm=[corp_d.doc2bow(doc) for doc in pp_docs]
    
    tfidf=TfidfModel(dtm)
    corp_tfidf=tfidf[dtm]
    
    lda = LdaMulticore(corp_tfidf, num_topics=5, id2word=corp_d, passes=60, workers=3)
    print(lda.print_topics(num_topics=5, num_words=5))
    checkdir(MODEL_DIR)
    corp_d.save(MODEL_DIR+"corpus_dict.model")
    #corp_tfidf.save(MODEL_DIR+"corpus_tfidf.model")
    lda.save(MODEL_DIR+"lda.model")
Example #10
    def createLDA(self, fileName='', modelName='', ldaPasses='', topicNum=''):
        '''
        fileName -> file for the dictionary (.dict) and corpus (.mm) files 
        modelName -> model name for LDA to save to disk
        ldaPasses -> number of passes, 10 by default
        topicNum -> number of topics to generate, 100 by default
        '''
        if fileName == '':
            fileName = self.__fileName

        if ldaPasses == '':
            ldaPasses = self.__ldaPasses

        if topicNum == '':
            topicNum = self.__topicNum

        if modelName == '':
            modelName = fileName + '_' + str(ldaPasses) + 'P_' + str(
                topicNum) + 'T'

        dict = corpora.Dictionary.load(self.__destination + fileName + '.dict')
        mm = corpora.MmCorpus(self.__destination + fileName + '.mm')

        #lda = models.ldamodel.LdaModel(corpus=mm, id2word=dict, num_topics=6, update_every=1, chunksize=10000, passes=10)
        lda = LdaMulticore(corpus=mm,
                           num_topics=topicNum,
                           id2word=dict,
                           chunksize=30000,
                           passes=ldaPasses,
                           workers=3)
        lda.save(self.__destination + modelName + '.lda')
        #=======================================================================
        # print lda
        #=======================================================================
        print('Created LDA model %s' % self.__fileName)
Example #11
def exec_lda():
    client = MongoClient()
    db = client.epistemonikos_files

    num_topics = range(2, 51)
    files = range(12)  # Number of possible preprocessing combinations

    # ALL documents
    for f in files:
        for t in num_topics:
            data_cursor = DBDataCursor(db, 'preprocess_{0}'.format(f))
            corpus_cursor = CorpusCursor(data_cursor)
            lda = LdaMulticore(corpus=corpus_cursor,
                               id2word=corpus_cursor.dictionary, num_topics=t)
            lda.save(
                'processing_data/lda/all_docs/preprocess_{0}_topics_{1}'.format(
                    f, t))

    # For each document TYPE
    config_list = utils.create_config_list(
        'processing_data/lda/types/config_list.json')

    for i, config in enumerate(config_list):
        for f in files:
            for t in num_topics:
                data_cursor = DBDataCursor(db, 'preprocess_{0}'.format(f),
                                           **config)
                corpus_cursor = CorpusCursor(data_cursor, config=i)
                lda = LdaMulticore(corpus=corpus_cursor,
                                   id2word=corpus_cursor.dictionary,
                                   num_topics=t)
                lda.save(
                    'processing_data/lda/types/preprocess_{0}-topics_{1}-config_{2}'.format(
                        f, t, i))
Example #12
def lda_matrix(matrix_id, preprocess, topics, data_path):

    data_cursor, corpus_cursor = get_file_cursor(matrix_id, preprocess, data_path)
    lda = LdaMulticore(corpus=corpus_cursor, id2word=corpus_cursor.dictionary,
                       num_topics=topics)
    lda.save(os.path.join(data_path, '{2}-preprocess_{0}-topics_{1}.lda'.format(
                preprocess, topics, matrix_id)))
    return lda, corpus_cursor, data_cursor
Example #13
def create_model(session, df, feature):
    print(f"Updating model for feature {feature}")
    freeze_support()
    dct = get_dict(feature, session)
    corpus = common.remove_stopwords(df[feature]).tolist()
    corpus = [doc.split() for doc in corpus]
    corpus = [dct.doc2bow(text) for text in corpus]
    dct = Dictionary.load(session + "LDA-dictionary-" + feature + ".pk1")
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=dct,
                             workers=5,
                             iterations=1500,
                             alpha=0.01)
    lda_model.save(session + "LDA-model-" + feature)
Example #14
class LDA(object):
    def __init__(self, max_workers, num_topics, passes, preprocessor=None):
        self.log = logging.getLogger('lda_model')
        self.passes = passes
        self.num_topics = num_topics
        self.max_workers = max_workers
        self.preprocessor = preprocessor if preprocessor is not None else Preprocessor(
            max_workers=max_workers)
        self.model, self.dictionary = None, None

    def train(self, doc_list):
        self.log.info('LDA.train called. Starting preprocessing %d documents',
                      len(doc_list))
        preprocessed_docs = self.preprocessor.process_docs(doc_list)

        self.log.info('Preprocessing ended. Building dictionary')
        self.dictionary = Dictionary(preprocessed_docs)

        self.log.info('Dictionary built with %d words. Building corpus',
                      len(self.dictionary))
        corpus = self.build_corpus(preprocessed_docs, self.dictionary)

        self.log.info(
            'Built corpus. Starting actual training with '
            '%d topics, %d workers, %d passes', self.num_topics,
            self.max_workers, self.passes)
        self.model = LdaMulticore(corpus,
                                  num_topics=self.num_topics,
                                  id2word=self.dictionary,
                                  workers=self.max_workers,
                                  passes=self.passes)

    def save_model(self, model_path):
        self.log.info('Saving LDA model to file: %s', model_path)
        self.model.save(model_path)

    def save_dictionary(self, dict_path):
        self.log.info('Saving dictionary to file: %s', dict_path)
        self.dictionary.save(dict_path)

    def build_corpus(self, doc_list, dictionary):
        with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
            return list(executor.map(dictionary.doc2bow, doc_list))

    @staticmethod
    def with_url_handling(max_workers, num_topics, passes):
        return LDA(max_workers,
                   num_topics,
                   passes,
                   preprocessor=WithUrlPreprocessor(max_workers=max_workers))
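A possible way to use the wrapper class above; the documents, topic count and output paths are placeholders:

docs = ["first example document", "second example document"]

lda = LDA(max_workers=4, num_topics=20, passes=5)
lda.train(docs)
lda.save_model("models/lda_20topics.model")
lda.save_dictionary("models/lda_20topics.dict")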
Example #15
    def run(self):
        data = pd.read_pickle(self.input().path)
        sentences = (data['question1'].str.split().tolist() +
                     data['question2'].str.split().tolist())
        dictionary = corpora.Dictionary(sentences)
        corpus = list(map(dictionary.doc2bow, sentences))
        lda = GensimLdaModel(corpus,
                             num_topics=self.num_topics,
                             id2word=dictionary,
                             chunksize=1000,
                             passes=self.passes,
                             minimum_probability=-1.0)
        lda_file, dictionary_file = self.output()
        lda.save(lda_file.path)
        dictionary.save(dictionary_file.path)
Example #16
def calculate_keys(vol, n_top, n_pass, cache_corpus=True, cache_model=True):

    texts_path = "../arxiv/{0}/{1}/".format(vol.section, vol.year)

    if not os.path.isdir(texts_path):
        raise Exception('There is no such path: {}'.format(texts_path))

    files_list = shared.random_glob(texts_path, n_proc_articles)
    print(len(files_list))
    texts = prepare_sentences(files_list, n_proc_articles)
    print(len(texts))
    print("Searching for bigrams...")

    if config.biGram:
        bigram_transformer = Phrases(texts, min_count=10)
        texts = list(bigram_transformer[texts])

    texts = shared.plural_filter(texts)

    print("Building corpus..")
    dictionary = corpora.Dictionary(texts)

    dictionary.filter_extremes(no_below=20)

    corpus = [dictionary.doc2bow(text) for text in texts]

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    print("Running LDA...")
    lda = LdaMulticore(corpus,
                       num_topics=n_top,
                       id2word=dictionary,
                       workers=4,
                       passes=n_pass,
                       iterations=400,
                       eval_every=None)

    if cache_corpus:
        with open(config.lda_stat + "{0}.corpus".format(volume), 'wb') as f:
            pickle.dump(corpus, f)

        with open(config.lda_stat + "{0}.dict".format(volume), 'wb') as f:
            pickle.dump(texts, f)

    if cache_model:
        lda.save("{0}{1}".format(config.lda_stat, volume))
    return lda
Example #17
def LDA(dictionnaire, corpus, nbtopic=5):
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=dictionnaire,
                             random_state=100,
                             num_topics=nbtopic,
                             passes=10,
                             chunksize=1000,
                             batch=False,
                             alpha='asymmetric',
                             decay=0.5,
                             offset=64,
                             eta=None,
                             eval_every=0,
                             iterations=100,
                             gamma_threshold=0.001,
                             per_word_topics=True)
    #  save the lda model
    lda_model.save('lda_model.model')
Example #18
def train_lda_multicore(articles, n_topics, outfile="lda", workers=3):
    common_dictionary = corpora.Dictionary(articles)
    common_corpus = [common_dictionary.doc2bow(d) for d in articles]
    print("Documents: ", str(len(articles)))
    print("Vocabulary: ", len(common_dictionary))
    print("Topics: ", n_topics)
    print("Training LDA...")
    lda = LdaMulticore(common_corpus,
                       id2word=common_dictionary,
                       num_topics=n_topics,
                       workers=workers)

    model_file = "trained_models/" + outfile
    lda.save(model_file)
    dict_filename = model_file + "_dict.pkl"
    pickle.dump(common_dictionary, open(dict_filename, "wb"))
    dict_filename = model_file + "_corpus.pkl"
    pickle.dump(common_corpus, open(dict_filename, "wb"))
    print("Saved trained LDA model as", model_file, "!")
Example #19
class LDA(object):
    def __init__(self, max_workers,
                 num_topics,
                 passes):
        self.passes = passes
        self.num_topics = num_topics
        self.max_workers = max_workers
        self.model, self.dictionary = None, None

    def train(self, preprocessed_docs):
        logger.info('Building dictionary')
        self.dictionary = Dictionary(preprocessed_docs)
        logger.info('Dictionary built with %d words. Building corpus', len(self.dictionary))
        corpus = self.build_corpus(preprocessed_docs, self.dictionary)
        logger.info('Built corpus. Starting actual training with '
                      '%d topics, %d workers, %d passes', self.num_topics, self.max_workers, self.passes)
        self.model = LdaMulticore(corpus,
                                  num_topics=self.num_topics,
                                  id2word=self.dictionary,
                                  workers=self.max_workers,
                                  passes=self.passes)

    def save_model(self, model_path):
        logger.info('Saving LDA model to file: %s', model_path)
        self.model.save(model_path)

    def save_dictionary(self, dict_path):
        logger.info('Saving dictionary to file: %s', dict_path)
        self.dictionary.save(dict_path)

    def build_corpus(self, doc_list, dictionary):
        with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
            return list(executor.map(dictionary.doc2bow, doc_list))

    @staticmethod
    def with_url_handling(max_workers,
                          num_topics,
                          passes
                          ):
        return LDA(max_workers,
                   num_topics,
                   passes)
Example #20
    def train(self, dataset):
        corpus, dictionary = self._prepare(dataset)
        dictionary.save('../models.nosync/lda/dict')

        print('starting LDA')
        model = LdaMulticore(
            corpus=corpus,
            # distributed=True,
            workers=3,
            id2word=dictionary.id2token,
            chunksize=4000,
            alpha=self.c.alpha,  # optimized alpha
            eta='auto',
            iterations=self.c.lda_iter,
            num_topics=self.c.lda_topics,
            passes=self.c.lda_passes,
            eval_every=5000)
        path = '../models.nosync/lda/model'
        model.save(path)
        return model, corpus
Example #21
def lda3(corpus, dictionary):
  lda_model = LdaMulticore(corpus=corpus,
                          id2word=dictionary,
                          random_state=22,
                          num_topics=100,
                          passes=10,
                          chunksize=1000,
                          batch=False,
                          alpha='asymmetric',
                          decay=0.5,
                          offset=64,
                          eta=None,
                          eval_every=0,
                          iterations=100,
                          gamma_threshold=0.001,
                          per_word_topics=True)
  # save the model
  lda_model.save('lda_model.model')
  # See the topics
  for topic in lda_model.print_topics(100,20):
      print(topic)
Example #22
def train_lda_multicore(corpus_bow, dictionary, topic_num, model_path):
    """
    多核训练
    :param corpus_bow: 语料
    :param dictionary: 词典
    :param topic_num: 主题数
    :param model_path: 模型保存位置
    :return:
    """
    start = time.time()
    print '开始训练: %d个主题' % topic_num
    model_lda = LdaMulticore(corpus=corpus_bow,
                             id2word=dictionary,
                             num_topics=topic_num,
                             alpha='asymmetric',
                             minimum_probability=0.0001,
                             minimum_phi_value=0.00001,
                             passes=4,
                             workers=2)
    print('Multicore training took %ds' % (time.time() - start))
    # save the model
    model_lda.save(model_path)
Example #23
    def get_lda(self, lower_bound, higher_bound, read_corpus=None, save=True):
        if read_corpus is not None:
            with open(self.path + read_corpus, "rb") as file:
                corpus = pickle.load(file)
            self.corpus = corpus
            del corpus

        self.dictionary = gensim.corpora.Dictionary(self.corpus)
        self.dictionary.filter_extremes(no_below=2, no_above=0.1)
        self.bow_corpus = [self.dictionary.doc2bow(doc) for doc in self.corpus]
        #lda_models_coherence_cV = []
        for num_topics in tqdm(range(lower_bound, higher_bound),desc="Training LDAs"):
            model_lda = LdaMulticore(corpus = self.bow_corpus,
                                    num_topics=num_topics,
                                    id2word=self.dictionary,
                                    workers=8)

            coherencemodel = CoherenceModel(model=model_lda,
                                            texts=self.corpus,
                                            dictionary=self.dictionary,
                                            coherence='c_v')

            coherence_value = coherencemodel.get_coherence()
            #lda_models_coherence_cV.append(coherence_value)
            if num_topics < 10:
                num_topics = "0"+str(num_topics)
            self.lda_models[str(num_topics)] = model_lda
            self.coherence[str(num_topics)] = coherence_value
            model_lda.save(self.path+"/topic/lda_"+str(num_topics)+".model")
            coherencemodel.save(self.path+"/coherence/coherence_" + str(num_topics) + ".model")

        if save:
            pickle.dump(self.bow_corpus, open(self.path + "/data/bow_corpus.pkl", 'wb'))
            pickle.dump(self.dictionary, open(self.path + "/data/Ldictionary.pkl", 'wb'))
            pickle.dump(self.lda_models, open(self.path + "/data/models.pkl", 'wb'))
            pickle.dump(self.coherence, open(self.path + "/data/coherence.pkl", 'wb'))
            json.dump(self.coherence, open(self.path + "/coherences.json", 'w'))
Example #24
def train_lda(n_topics=10):
    with open("../result/ad_issue_reviews") as fin:
        reviews = json.load(fin)
    # build bag-of-words, corpus
    reviews = [[word for word in review if word not in stopwords.words('english')] for review in reviews]
    from collections import defaultdict
    freq = defaultdict(int)
    for review in reviews:
        for token in review:
            freq[token] += 1
    reviews = [[token for token in review if freq[token] > 1] for review in reviews]
    # dictionary = corpora.Dictionary(reviews)
    # only select ad related word
    with open("../result/relevant_ad_issues.json") as fin:
        ad_words = json.load(fin)
    ad_words = ad_words["ad"]
    dictionary = corpora.Dictionary([ad_words])

    corpus = [dictionary.doc2bow(review) for review in reviews]
    logging.info("LDA start training...")
    lda = LdaMulticore(corpus, num_topics=n_topics)

    lda.save("../model/lda_ad_%d.model"%n_topics)
    return lda
Example #25
def train(n_topics=num_topics):
    '''Train LDA model'''

    docs = read_ap.get_processed_docs()

    docs = [d for i, d in docs.items()]

    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=50)

    # save the dictionary
    with open('./objects/dictionary_lda', 'wb') as f:
        pkl.dump(dictionary, f)

    # creating bow
    print('creating bow corpus')
    corpus_bow = [dictionary.doc2bow(d) for d in docs]
    # creating binary bow
    print('creating binary bow')
    corpus_binary = [[(i, 1) for i, _ in d] for d in corpus_bow]

    # with open(os.path.join(objects_path, 'corpus'), 'wb') as f:
    #     pickle.dump(corpus_tfidf, f)

    print(f'{time.ctime()} Start training LDA (BOW)')
    lda_bow = LdaMulticore(workers=5,
                           corpus=corpus_binary,
                           id2word=dictionary,
                           chunksize=1000,
                           num_topics=n_topics,
                           dtype=np.float64)

    # save models to disk
    os.makedirs(models_path, exist_ok=True)

    lda_bow.save(os.path.join(models_path, f'lda_bow_multi'))
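A short sketch, not part of the original function, of evaluating the trained lda_bow with u_mass coherence, which needs only the bag-of-words corpus (assumes lda_bow, corpus_binary and dictionary from above are in scope):

from gensim.models import CoherenceModel

coherence_model = CoherenceModel(model=lda_bow,
                                 corpus=corpus_binary,
                                 dictionary=dictionary,
                                 coherence='u_mass')
print('u_mass coherence:', coherence_model.get_coherence())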
Example #26
class Embedding(metaclass=SingletonMetaclass):
    def __init__(self):
        '''
        @description: This is the embedding class. It may be called many times, so we use the singleton pattern.
        In this class we can use tfidf, word2vec, fasttext and autoencoder word embeddings.
        @param {type} None
        @return: None
        '''
        # stop words
        self.stopwords = []
        with open(config.stopwords, encoding='utf-8', mode='r') as f:
            for line in f.readlines():
                self.stopwords.append(line.strip())

        self.tfidf = None
        self.w2v = None
        self.LDAmodel = None

    def load_data(self, path):
        '''
        @description: Load all data, then do word segmentation
        @param {type} None
        @return:None
        '''
        data = pd.read_csv(path, sep='\t', header=0)
        data = data.fillna("")

        # Split the words in data['text'] and remove stop words. Reference format: data['text'] = data["text"].apply(lambda x: " ".join(x))
        data['text'] = data['text'].apply(lambda x: " ".join(
            [w for w in x.split() if w not in self.stopwords and w != '']))

        self.labelToIndex = label2idx(data)
        data['label'] = data['label'].map(self.labelToIndex)
        data['label'] = data.apply(lambda row: float(row['label']), axis=1)
        data = data[['text', 'label']]

        # self.train, _, _ = np.split(data[['text', 'label']].sample(frac=1), [int(data.shape[0] * 0.7), int(data.shape[0] * 0.9)])
        self.train = data['text'].tolist()

        vocab = {}
        for sentence in self.train:
            for word in sentence.split():
                if word not in vocab:
                    vocab[word] = 1
                else:
                    vocab[word] += 1

        with open(config.vocab_path, "w", encoding='utf-8') as f:
            for k, v in vocab.items():
                f.write("%s %s\n" % (k, v))

    def trainer(self):
        '''
        @description: Train tfidf,  word2vec, fasttext and autoencoder
        @param {type} None
        @return: None
        '''
        # initialize the TfidfVectorizer (count_vect)
        logging.info("Training tfidf..........")
        count_vect = TfidfVectorizer(stop_words=self.stopwords,
                                     max_df=0.4,
                                     min_df=0.001,
                                     ngram_range=(1, 2))

        self.tfidf = count_vect.fit(self.train)
        self.train = [sample.split() for sample in self.train]

        # initialize word2vec, build the vocabulary and train
        logging.info("Training word2vec..........")
        self.w2v = models.Word2Vec(sentences=self.train,
                                   min_count=2,
                                   window=5,
                                   vector_size=300,
                                   sample=6e-5,
                                   alpha=0.03,
                                   min_alpha=0.0007,
                                   negative=15,
                                   workers=4,
                                   max_vocab_size=50000)
        self.w2v.build_vocab(self.train, update=True)
        self.w2v.train(self.train,
                       total_examples=self.w2v.corpus_count,
                       epochs=15)

        self.id2word = gensim.corpora.Dictionary(self.train)
        corpus = [self.id2word.doc2bow(text) for text in self.train]
        logging.info(corpus[:5])

        # build the LDA model
        logging.info("Training LDA model..........")
        self.LDAmodel = LdaMulticore(corpus=corpus,
                                     id2word=self.id2word,
                                     num_topics=30)

    def saver(self):
        '''
        @description: save all models
        @param {type} None
        @return: None
        '''
        if not os.path.exists("model"):
            os.makedirs("model")

        joblib.dump(self.tfidf, './model/tfidf')

        self.w2v.wv.save_word2vec_format('./model/w2v.bin', binary=False)

        self.LDAmodel.save('./model/lda')

    def load(self):
        '''
        @description: Load all embedding models
        @param {type} None
        @return: None
        '''
        self.tfidf = joblib.load('./model/tfidf')
        self.w2v = models.KeyedVectors.load_word2vec_format('./model/w2v.bin',
                                                            binary=False)
        self.lda = models.ldamodel.LdaModel.load('./model/lda')
Example #27
class Embedding(metaclass=SingletonMetaclass):
    def __init__(self):
        '''
        @description: This is the embedding class. It may be called many times, so we use the singleton pattern.
        In this class we can use tfidf, word2vec, fasttext and autoencoder word embeddings.
        @param {type} None
        @return: None
        '''
        # stop words
        self.stopWords = open(root_path + '/data/stopwords.txt',
                              encoding='utf-8').readlines()
        # autoencoder
        self.ae = AutoEncoder()

    def load_data(self):
        '''
        @description: Load all data, then do word segmentation
        @param {type} None
        @return:None
        '''
        logger.info('load data')
        self.data = pd.concat([
            pd.read_csv(root_path + '/data/train.tsv', sep='\t'),
            pd.read_csv(root_path + '/data/dev.tsv', sep='\t'),
            pd.read_csv(root_path + '/data/test.tsv', sep='\t')
        ])
        self.data["text"] = self.data['title'] + self.data['desc']
        self.data["text"] = self.data["text"].apply(query_cut)
        self.data['text'] = self.data["text"].apply(lambda x: " ".join(x))

    def trainer(self):
        '''
        @description: Train tfidf,  word2vec, fasttext and autoencoder
        @param {type} None
        @return: None
        '''
        logger.info('train tfidf')
        count_vect = TfidfVectorizer(stop_words=self.stopWords,
                                     max_df=0.4,
                                     min_df=0.001,
                                     ngram_range=(1, 2))
        self.tfidf = count_vect.fit(self.data["text"])
        logger.info('train word2vec')

        self.data['text'] = self.data["text"].apply(lambda x: x.split(' '))
        self.w2v = models.Word2Vec(min_count=2,
                                   window=5,
                                   size=300,
                                   sample=6e-5,
                                   alpha=0.03,
                                   min_alpha=0.0007,
                                   negative=15,
                                   workers=4,
                                   iter=30,
                                   max_vocab_size=50000)
        self.w2v.build_vocab(self.data["text"])
        self.w2v.train(self.data["text"],
                       total_examples=self.w2v.corpus_count,
                       epochs=15,
                       report_delay=1)

        logger.info('train fast')
        # train FastText word vectors
        self.fast = models.FastText(
            self.data["text"],
            size=300,  # vector dimensionality
            window=3,  # context window size
            alpha=0.03,
            min_count=2,  # truncate the vocabulary: words with a count below this are dropped; raising it shrinks the vocabulary
            iter=30,  # number of iterations
            max_n=3,
            word_ngrams=2,
            max_vocab_size=50000)

        logger.info('train lda')
        self.id2word = gensim.corpora.Dictionary(self.data.text)
        corpus = [self.id2word.doc2bow(text) for text in self.data.text]
        self.LDAmodel = LdaMulticore(corpus=corpus,
                                     id2word=self.id2word,
                                     num_topics=30,
                                     workers=2,
                                     chunksize=4000,
                                     passes=7,
                                     alpha='asymmetric')

        logger.info('train autoencoder')
        self.ae.train(self.data)

    def saver(self):
        '''
        @description: save all models
        @param {type} None
        @return: None
        '''
        logger.info('save autoencoder model')
        self.ae.save()

        logger.info('save tfidf model')
        joblib.dump(self.tfidf, root_path + '/model/embedding/tfidf')

        logger.info('save w2v model')
        self.w2v.wv.save_word2vec_format(root_path +
                                         '/model/embedding/w2v.bin',
                                         binary=False)

        logger.info('save fast model')
        self.fast.wv.save_word2vec_format(root_path +
                                          '/model/embedding/fast.bin',
                                          binary=False)

        logger.info('save lda model')
        self.LDAmodel.save(root_path + '/model/embedding/lda')

    def load(self):
        '''
        @description: Load all embedding models
        @param {type} None
        @return: None
        '''
        logger.info('load tfidf model')
        self.tfidf = joblib.load(root_path + '/model/embedding/tfidf')

        logger.info('load w2v model')
        self.w2v = models.KeyedVectors.load_word2vec_format(
            root_path + '/model/embedding/w2v.bin', binary=False)

        logger.info('load fast model')
        self.fast = models.KeyedVectors.load_word2vec_format(
            root_path + '/model/embedding/fast.bin', binary=False)

        logger.info('load lda model')
        self.lda = LdaModel.load(root_path + '/model/embedding/lda')

        logger.info('load autoencoder model')
        self.ae.load()
Example #28
def model(n_topics,
          alpha=None,
          beta=None,
          saved=False,
          pyldavis=False,
          wordclouds=False,
          rep_letters=False,
          plots=False) -> dict:
    assert n_topics >= 2
    """
	aux functions to make sure it's loading the desired model
	"""
    def verify_alpha(lda_model, given):
        actual: list = lda_model.alpha
        if given == "asymmetric":
            return not np.isclose(actual[0], actual[-1])
        elif given == "symmetric":
            return np.isclose(actual[0], actual[-1])
        else:
            return np.isclose(given, actual[0]) and np.isclose(
                given, actual[-1])

    def verify_beta(lda_model, given):
        actual = lda_model.eta
        if type(given) == float:
            return np.isclose(given, actual[0]) and np.isclose(
                given,
                actual[-1])  # basic == comparison doesn't work bc floats suck
        else:
            return False

    print(f"Building LDA model for {n_topics} topics.")

    if saved:
        lda = LdaMulticore.load(f"{TRAINED_LDA}{n_topics}")

        # if not (verify_alpha(lda, alpha) and verify_beta(lda, beta)):
        # print("Loaded model didn't pass parameter verification; train it from scratch or load the correct one.")
        # return

        print(f"Trained LDA model with {n_topics} topics loaded successfully.")

    else:
        lda = LdaMulticore(
            corpus,
            num_topics=n_topics,
            id2word=dictionary,
            passes=20,
            alpha=alpha if alpha is not None else "symmetric",  # default
            eta=beta,
            random_state=1,
            iterations=100,
            eval_every=5,
            workers=3,
            per_word_topics=True)

        lda.save(f"{TRAINED_LDA}{n_topics}")
        print(
            f"LDA model with {n_topics} topics trained and saved successfully."
        )
    """
	save per-word-topics 3D matrix
	[!] alters global variable
	"""
    V = len(dictionary)
    K = n_topics
    N = len(corpus)
    global pwt
    pwt = np.zeros((V, K, N))
    """
	save topic assignment info in dataframes
	[!] alters global variables
	"""
    global vw
    global vws
    vws = get_topic_dists_dataframe(lda)
    vw, vws = set_main_topics(vw, vws)
    """
	coherence and silhouette scores
	"""
    coherence = CoherenceModel(model=lda,
                               texts=letters,
                               dictionary=dictionary,
                               coherence='c_v').get_coherence()
    print(f"Coherence score: {coherence}")  # the higher the better

    avg_silhouette = plot_silhouette(vws)
    print(f"Average silhouette coefficient: {avg_silhouette}"
          )  # the higher the better
    """
	other validation methods
	"""
    if pyldavis:
        vis = pyLDAvis.gensim.prepare(topic_model=lda,
                                      corpus=corpus,
                                      dictionary=dictionary,
                                      n_jobs=3)
        pyLDAvis.save_html(vis, f"{PYLDAVIS_PATH}/lda{n_topics}.html")

    if rep_letters:
        save_representative_letters(vws, 3)

    if wordclouds:
        save_topic_wordclouds(pwt)

    if plots:
        plot_topics_per_year(vw)
        plot_topics_per_recipient(vw)

    return {
        "model": lda,
        "num_topics": n_topics,
        "alpha": alpha,
        "beta": beta,
        "coherence": coherence,
        "silhouette": avg_silhouette,
        "vws": vws,
        "pwt": pwt
    }
Example #29
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=dct,
                             random_state=100,
                             num_topics=num_topics,
                             passes=10,
                             chunksize=1000,
                             batch=False,
                             alpha='asymmetric',
                             decay=0.5,
                             offset=64,
                             eta=None,
                             eval_every=0,
                             iterations=100,
                             gamma_threshold=0.001,
                             per_word_topics=True)

    # save the model
    lda_model.save('tmp/lda_model.model')

    # See the topics
    lda_model.print_topics(-1)

    for c in lda_model[corpus[5:8]]:
        print("Document Topics      : ", c[0])  # [(Topics, Perc Contrib)]
        print("Word id, Topics      : ", c[1][:3])  # [(Word id, [Topics])]
        print("Phi Values (word id) : ",
              c[2][:2])  # [(Word id, [(Topic, Phi Value)])]
        print("Word, Topics         : ",
              [(dct[wd], topic)
               for wd, topic in c[1][:2]])  # [(Word, [Topics])]
        print("Phi Values (word)    : ",
              [(dct[wd], topic)
               for wd, topic in c[2][:2]])  # [(Word, [(Topic, Phi Value)])]
Example #30
def train_save(n_topics, uni_or_bi, bow_corpus, dic):

    name = str(n_topics) + "-topics"
    lda_model = LDA(bow_corpus, num_topics=n_topics, id2word=dic, passes=2)
    lda_model.save(os.getcwd() + "/LDA models/{}/{}".format(uni_or_bi, name))
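A possible driver for train_save above; the tokenized articles and the candidate topic counts are assumptions:

from gensim import corpora

tokenized_articles = [["sample", "tokens", "one"], ["sample", "tokens", "two"]]
dic = corpora.Dictionary(tokenized_articles)
bow_corpus = [dic.doc2bow(doc) for doc in tokenized_articles]

# Train and save one model per candidate topic count.
for k in (5, 10, 20):
    train_save(k, "unigram", bow_corpus, dic)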
Example #31
                     iterations=iterations,
                     num_topics=num_topics,
                     passes=passes,
                     eval_every=eval_every,
                     workers=4)

top_topics = model.top_topics(corpus)  #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)
print(top_topics)
numpy.save(os.path.join(out_path, "topics.npy"), top_topics)
model.save(os.path.join(out_path, "lda_model"))

#predict a topic for a document
important_words = docs[2]
print(important_words)
print(len(important_words))

ques_vec = []
ques_vec = dictionary.doc2bow(important_words)
print("ques_vec", ques_vec)

topic_vec = []
topic_vec = model[ques_vec]
print("topic_vec", topic_vec)

word_count_array = numpy.empty((len(topic_vec), 2), dtype=numpy.object)