Esempi in Python per LdaModel.LdaModel, esempi in Python per gensim.models.ldamodel.LdaModel.LdaModel

Esempio n. 1

0

Mostra file

    def lda_model(self,
                  num_topics: [int, None] = 10,
                  passes: [int, None] = 1,
                  seed: [int, None] = None):
        """
        Train one LDA topic model per year and wrap them in TopicResults.

        The dictionaries and corpora are built first when either is missing.
        Models are trained for every entry of ``self.year_list`` except the
        last one. When ``seed`` is given, a single ``RandomState`` created
        from it is shared by all models; otherwise no ``random_state`` is
        passed to ``LdaModel``.
        """
        if self.word_to_id is None or self.corpora is None:
            self.build_dictionaries_and_corpora()

        models_by_year = num_dict(self.year_list)

        # Only forward random_state when the caller asked for a seeded run.
        seeded_args = {}
        if seed is not None:
            seeded_args['random_state'] = RandomState(seed)

        for yr in self.year_list[:-1]:
            models_by_year[yr] = LdaModel(corpus=self.corpora[yr],
                                          id2word=self.word_to_id[yr],
                                          num_topics=num_topics,
                                          passes=passes,
                                          **seeded_args)

        return TopicResults(models_by_year, self.num_docs, self.name)

Esempio n. 2

0

Mostra file

File: LDA.py Progetto: stephenhky/PyBibleNLP

 def trainModel(self):
     """Train the LDA model and build a similarity index over its output.

     When ``self.toweight`` is truthy the corpus is first transformed with
     ``self.tfidf``; otherwise the plain corpus is used for both steps.
     """
     training_corpus = self.tfidf[self.corpus] if self.toweight else self.corpus
     self.model = LdaModel(training_corpus, num_topics=self.num_topics)
     self.index = MatrixSimilarity(self.model[training_corpus])

Esempio n. 3

0

Mostra file

def create_models(df):
    """Train and save two LDA models (genres and subgenres).

    A 50-topic "subgenre" model and a 10-topic "genre" model are trained on
    the same bag-of-words corpus and saved to ``lda_models/subgenre.model``
    and ``lda_models/genre.model``. The ``lda_models`` folder is created if
    it does not exist yet.

    NOTE(review): the ``df`` argument is immediately replaced by the result
    of ``get_all_genres()``, so whatever the caller passes is ignored. Kept
    as-is for backward compatibility -- confirm whether that is intended.
    """
    import os

    df = get_all_genres()
    id2word = corpora.Dictionary(df.genres)
    corpus = [id2word.doc2bow(genres) for genres in df.genres]

    # Both models share every setting except the number of topics.
    shared_args = dict(corpus=corpus,
                       id2word=id2word,
                       random_state=100,
                       update_every=1,
                       passes=5,
                       alpha='auto',
                       per_word_topics=True)
    # captures subgenres with 50 categories
    subgenre_model = LdaModel(num_topics=50, **shared_args)
    # capture main genres with 10 categories
    genre_model = LdaModel(num_topics=10, **shared_args)

    # Make sure the target folder exists before saving into it.
    os.makedirs('lda_models', exist_ok=True)
    subgenre_model.save('lda_models/subgenre.model')
    genre_model.save('lda_models/genre.model')

Esempio n. 4

0

Mostra file

File: lda.py Progetto: KevinShih707/Crawl-Curation

 def __trainingModel(self):
     """Train ``self.ldaModel`` on the TF-IDF corpus held by ``self.corpora``.

     When ``self.seed`` is not None the model is given a
     ``numpy.random.RandomState`` built from it; otherwise no
     ``random_state`` is passed to ``LdaModel``.
     """
     model_args = dict(corpus=self.corpora.TfidfPair,
                       id2word=self.corpora.Dictionary,
                       num_topics=self.numTopics)
     # Identity check: ``is not None`` is the idiomatic test for None.
     if self.seed is not None:
         model_args['random_state'] = np.random.RandomState(self.seed)
     self.ldaModel = LdaModel(**model_args)

Esempio n. 5

0

Mostra file

File: topicModel.py Progetto: sslogar/coherency-tracker

def getCoherency(d, corp, topics=10, coherence='u-mass', varyTopics=False):
    """Return a 'u_mass' coherence score for LDA models trained on ``corp``.

    ``d`` is passed as the third positional argument of ``LdaModel`` and
    ``topics`` as the second. With ``varyTopics`` false, the coherence of a
    single model with ``topics`` topics is returned. With ``varyTopics``
    true, models with 5 to 15 topics are trained and the maximum coherence
    is returned.

    NOTE(review): the ``coherence`` argument is never read; 'u_mass' is
    hard-coded in every ``CoherenceModel`` call. The baseline model is also
    trained when ``varyTopics`` is true, although its score is then unused.
    """
    def _coherence_model(topic_count):
        trained = LdaModel(corp, topic_count, d)
        return CoherenceModel(model=trained, corpus=corp, coherence='u_mass')

    baseline = _coherence_model(topics)
    if not varyTopics:
        return baseline.get_coherence()

    scores = [_coherence_model(count).get_coherence() for count in range(5, 16)]
    return np.max(scores)

Esempio n. 6

0

Mostra file

File: semantic_vectors_generator.py Progetto: Lipairui/Text-semantic-similarity

def getLdaFeature(documents, topicNum):
    '''
     Function:
         generate lda features by training an lda model on the tf-idf
         weighted bag-of-words corpus
     Input:
         documents: list of preprocessed sentences (split on single spaces)
         topicNum: output vector dimension (number of topics)
     Output:
         lda features (DataFrame format), one row per document; topics
         reported with probability below 0.01 stay at zero
    '''
    # Build the bag-of-words corpus.
    tokenized = [sentence.split(' ') for sentence in documents]
    vocab = corpora.Dictionary(tokenized)
    bow_corpus = [vocab.doc2bow(tokens) for tokens in tokenized]

    # Train the LDA model on the tf-idf weighted corpus.
    weighted_corpus = TfidfModel(bow_corpus)[bow_corpus]
    ldaModel = LdaModel(weighted_corpus, num_topics=topicNum, chunksize=8000,
                        passes=10, random_state=12)

    # Fill one feature row per document with its rounded topic probabilities.
    LogInfo(' Generate LDA features...')
    features = np.zeros((len(tokenized), topicNum))
    for row, doc in enumerate(weighted_corpus):
        doc_topics = ldaModel.get_document_topics(doc, minimum_probability=0.01)
        for topic_id, probability in doc_topics:
            features[row, topic_id] = round(probability, 5)

    column_names = getColName(topicNum, "qlda")
    return pd.DataFrame(features, columns=column_names)

Esempio n. 7

0

Mostra file

    def coherence_values(self,
                         limit,
                         start=2,
                         step=2,
                         random_state=24,
                         passes=20):
        """Train LDA models over a range of topic counts and score them.

        One model is trained for every value in ``range(start, limit, step)``
        and scored with 'c_v' coherence against ``self.doc_list``. A progress
        line is printed for each model.

        Returns a tuple ``(model_list, coherence_values)`` whose entries are
        in the same order as the topic counts.
        """
        coherence_values = []
        model_list = []

        for num_topics in range(start, limit, step):
            model = LdaModel(self.corpus,
                             num_topics=num_topics,
                             id2word=self.dictionary,
                             random_state=random_state,
                             passes=passes)
            model_list.append(model)
            coherencemodel = CoherenceModel(model=model,
                                            texts=self.doc_list,
                                            dictionary=self.dictionary,
                                            coherence="c_v")
            # Compute the score once and reuse it for both the result list
            # and the progress line instead of calling get_coherence() twice.
            score = coherencemodel.get_coherence()
            coherence_values.append(score)
            print("Model with # of topics",
                  num_topics,
                  "has coherence:",
                  score,
                  end="\r",
                  flush=True)

        return model_list, coherence_values

Esempio n. 8

0

Mostra file

    def ldamodel(self, num_topics, random_state=24, passes=20):
        """Train and return an LdaModel on ``self.corpus`` / ``self.dictionary``."""
        settings = {
            'num_topics': num_topics,
            'id2word': self.dictionary,
            'random_state': random_state,
            'passes': passes,
        }
        return LdaModel(self.corpus, **settings)

Esempio n. 9

0

Mostra file

File: CustomerReviews.py Progetto: ZJane/commentAnalysis

 def find_topic(self,condition=None,n_topics=10,n_words=10,topic_model='lda',vec_model='tf',show=True,**kwargs):
     '''Topic modelling over the segmented texts.

     parameter
     ---------
     condition: boolean selector over the corpus; can be used to decompose
         topics for e.g. positive or negative reviews only
     n_topics: number of topics
     n_words: number of words output for each topic
     vec_model: vectorisation method, default is tf; 'idf' or 'tfidf'
         applies a TfidfModel to the corpus first

     Returns the topic keywords from ``show_topics`` when ``topic_model`` is
     'lda' or 'LDA'; any other value returns None.
     '''
     texts = self.texts_seg if condition is None else self.texts_seg[condition]
     if topic_model not in ['lda','LDA']:
         return None

     tokenized = [doc.split(' ') for doc in texts]
     dictionary = corpora.Dictionary(tokenized)
     corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
     if vec_model in ['idf','tfidf']:
         corpus = models.TfidfModel(corpus)[corpus]

     lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics)
     topics_keywords = lda.show_topics(num_topics=n_topics, num_words=n_words, formatted=False)
     if show:
         # One line per topic: its index followed by its keywords.
         lines = []
         for i, topic in enumerate(topics_keywords):
             keywords = ' | '.join([k[0] for k in topic[1]])
             lines.append('主题 {}: {}'.format(i, keywords))
         print('\n'.join(lines))
     return topics_keywords

Esempio n. 10

0

Mostra file

File: GensimTM.py Progetto: ankitrajshree/TopicModeling

def build_model(dictionary, corpus, n_topics, lemmatized_notes):
    """Train gensim LDA and Mallet LDA models for several topic counts.

    For every topic count in ``n_topics`` a gensim ``LdaModel`` and an
    ``LdaMallet`` model are trained and each is scored with 'c_v' coherence
    against ``lemmatized_notes``.

    Returns a tuple ``(model_mallet, coh_val_lda_mallet, model_lda,
    coh_val_lda)`` of four lists, each with one entry per topic count.
    """
    mallet_path = 'mallet/bin/mallet'
    coh_val_lda = []
    coh_val_lda_mallet = []
    model_lda = []
    model_mallet = []
    for topic in n_topics:
        # Build LDA model
        lda_model = LdaModel(corpus=corpus,
                             id2word=dictionary,
                             num_topics=topic,
                             random_state=100,
                             update_every=1,
                             chunksize=100,
                             passes=10,
                             alpha='auto',
                             per_word_topics=True)
        # Score the model just trained (not the list of earlier models).
        coh_lda_model = CoherenceModel(model=lda_model,
                                       texts=lemmatized_notes,
                                       dictionary=dictionary,
                                       coherence='c_v')
        coh_val_lda.append(coh_lda_model.get_coherence())
        model_lda.append(lda_model)

        # Build LDA Mallet model with the current topic count (not the
        # whole list of candidate counts).
        lda_mallet = LdaMallet(mallet_path,
                               corpus=corpus,
                               num_topics=topic,
                               id2word=dictionary)
        coh_mallet_model = CoherenceModel(model=lda_mallet,
                                          texts=lemmatized_notes,
                                          dictionary=dictionary,
                                          coherence='c_v')
        model_mallet.append(lda_mallet)
        coh_val_lda_mallet.append(coh_mallet_model.get_coherence())
    return model_mallet, coh_val_lda_mallet, model_lda, coh_val_lda

Esempio n. 11

0

Mostra file

    def fit_model(self, data, params, return_data=False):
        """
        Fit a gensim LdaModel to `data` using the parameter set `params`.

        `data` may be a matrix-like object (anything with `dtype`, `shape`
        and `transpose` attributes), a `(dictionary, corpus)` 2-tuple, or a
        corpus. A 'dictionary' entry in `params`, if present, is removed from
        `params` and used as `id2word` unless the tuple form supplies one.

        Returns the model, or `(model, (corpus, dtm))` when `return_data` is
        true.
        """
        from gensim.models.ldamodel import LdaModel

        dictionary = params.pop('dictionary', None)

        matrix_like = all(
            hasattr(data, attr) for attr in ('dtype', 'shape', 'transpose'))
        if matrix_like:
            corpus = dtm_to_gensim_corpus(data)
            dtm = data
        else:
            if isinstance(data, tuple) and len(data) == 2:
                dictionary, corpus = data
            else:
                corpus = data
            dtm = gensim_corpus_to_dtm(corpus)

        model = LdaModel(corpus, id2word=dictionary, **params)

        return (model, (corpus, dtm)) if return_data else model

Esempio n. 12

0

Mostra file

File: nlp.py Progetto: mohamedabdelbary/kaggle-quora

def train_lda(n_topics, id2word_dictionary=None, documents=None, corpus=None):
    """
    Train an LDA model with `n_topics` topics.

    `documents` is a list of lists of words/tokens. It is used to build the
    dictionary when `id2word_dictionary` is not supplied, and the corpus when
    `corpus` is not supplied.

    Returns `(lda_model, id2word_dictionary, word2idx_dictionary, topics)`,
    where `topics` holds the `show_topic` output for each topic index.
    """
    # Build the id -> word dictionary unless the caller supplied one.
    if not id2word_dictionary:
        id2word_dictionary = corpora.Dictionary(documents)

    # Reverse mapping: word -> id.
    word2idx_dictionary = {
        word: idx for idx, word in id2word_dictionary.items()
    }

    # Build the bag-of-words corpus unless the caller supplied one.
    if documents and not corpus:
        corpus = [id2word_dictionary.doc2bow(tokens) for tokens in documents]

    lda_settings = dict(corpus=corpus,
                        id2word=id2word_dictionary,
                        num_topics=n_topics,
                        update_every=1,
                        chunksize=10000,
                        passes=1)
    lda_model = LdaModel(**lda_settings)

    # The default topn (number of top words shown per topic) is 10. A high
    # enough value should return the words covering most or all of the
    # probability mass.
    topics = [
        lda_model.show_topic(topic_idx, topn=50000)
        for topic_idx in range(n_topics)
    ]

    return lda_model, id2word_dictionary, word2idx_dictionary, topics

Esempio n. 13

0

Mostra file

File: gemsimLDA.py Progetto: Winniekun/Data_Mining

def makeLDA(path, num_topics, num_words, passes):
    """Train an LDA model on a text file and print its topics.

    Parameters
    ----------
    path : path of a UTF-8 text file with one document per line
    num_topics : number of topics the model should look for
    num_words : how many words to print for each topic
    passes : how many times training goes over the data

    After training, the topics are pretty-printed, then the topic
    distribution of a hard-coded "unseen" e-mail file is printed as well.
    Nothing is returned.
    """
    def _tokenize(text):
        # Lower-case, split on whitespace, drop stop words and tokens that
        # are not purely alphanumeric.
        return [
            word for word in text.lower().split()
            if word not in STOPWORDS and word.isalnum()
        ]

    # Read the training documents from ``path`` (the original opened an
    # undefined name ``filename`` here and never used ``path``).
    with open(path, encoding='utf-8') as f:
        texts = [_tokenize(document) for document in f]

    # Create a dictionary and a corpus from the word lists.
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = LdaModel(corpus,
                   id2word=dictionary,
                   num_topics=num_topics,
                   passes=passes)
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(lda.print_topics(num_words=num_words))

    unseennText = '../../../data/LDA_data/lkmlSingleNewEmail.txt'
    with open(unseennText, encoding='utf-8') as fenw:
        newdoc = fenw.read()
    newcourpus = dictionary.doc2bow(_tokenize(newdoc))

    # Feed the new document into the existing LDA model.
    pp.pprint(lda[newcourpus])

Esempio n. 14

0

Mostra file

 def train(self, number_passes):
     """Train an LdaModel from ``self.numerical_corpus`` and store it on ``self.model``.

     ``self.numerical_corpus`` is unpacked into the mapping passed as
     ``id2word`` and the corpus passed to ``LdaModel``.
     """
     id2word, term_matrix = self.numerical_corpus
     self.model = LdaModel(term_matrix,
                           num_topics=self.number_topics,
                           id2word=id2word,
                           passes=number_passes)

Esempio n. 15

0

Mostra file

File: Sharding.py Progetto: informagi/ipfsearch

def trainModel():
    """ Train or load a model, depending on ``args.mode``.

    'Random'  -> returns ``(args.topics, 0)`` without training.
    'LDA'     -> builds a dictionary and corpus from every file in
                 ``args.input`` (one document per file), trains an LdaModel,
                 saves both under ./models/ and returns ``(lda, dct)``.
    'loadLDA' -> loads and returns the previously saved ``(lda, dct)``.
    Any other mode returns None.
    """
    def read_tokens(path):
        # Whole file as one whitespace-tokenised document.
        with open(path, "r", encoding='utf-8') as handle:
            return ' '.join(handle.readlines()).split()

    if args.mode == 'Random':
        return args.topics, 0

    # need to train on dump
    files = [
        f"{args.input}/{entry}" for entry in os.listdir(args.input)
        if os.path.isfile(os.path.join(args.input, entry))
    ]

    if args.mode == 'LDA':
        # create dictionary: seeded with the first file, extended by the rest
        dct = Dictionary([read_tokens(files[0])])
        for path in files[1:]:
            dct.add_documents([read_tokens(path)])
        # create corpus
        corpus = [dct.doc2bow(read_tokens(path)) for path in files]
        lda = LdaModel(corpus, num_topics=args.topics)
        lda.save("./models/LDAdump.model")
        dct.save("./models/LDAdump.dct")
        return lda, dct

    if args.mode == 'loadLDA':
        lda = LdaModel.load("./models/LDAdump.model")
        dct = Dictionary.load("./models/LDAdump.dct")
        return lda, dct

    return None

Esempio n. 16

0

Mostra file

File: kindness_index.py Progetto: katchinsky/leadership-practice

def create_LDA(comment_dict,
               num_topics=20,
               chunk_size=50,
               max_iter=20,
               from_db=True,
               get_data_func=None):
    """Train an LDA model incrementally on chunks of preprocessed comments.

    Texts come from ``data_preprocessor``; each non-empty stemmed text is
    converted to bag-of-words with ``comment_dict``. The first full chunk of
    ``chunk_size`` documents creates the model and every later chunk updates
    it. Documents left over after the last full chunk are trained on too
    (previously they were silently dropped).

    Returns the trained ``LdaModel``, or None if no documents were produced.
    """
    def _train_on(batch, model):
        # Create the model from the first batch, update it afterwards.
        if model is None:
            return LdaModel(corpus=batch,
                            num_topics=num_topics,
                            id2word=comment_dict,
                            per_word_topics=1,
                            passes=10)
        model.update(corpus=batch)
        return model

    lda = None
    text_gen = data_preprocessor(max_iter=max_iter,
                                 from_db=from_db,
                                 get_data_func=get_data_func)
    corpus = []
    for _, stemmed_text, _ in text_gen:
        if len(stemmed_text) != 0:
            corpus.append(comment_dict.doc2bow(stemmed_text))
        if len(corpus) == chunk_size:
            lda = _train_on(corpus, lda)
            corpus = []

    # Flush the trailing partial chunk so its documents are not lost.
    if corpus:
        lda = _train_on(corpus, lda)
    return lda

Esempio n. 17

0

Mostra file

def train_lda(recipe_file,num_topics,output_file):
    """Train an LDA model on a recipe corpus, save it and return it.

    The corpus is also serialised in Matrix Market format to
    ``output_file + '.corpus.mm'``; the model itself is saved to
    ``output_file``. ``num_topics`` is converted with ``int``.
    """
    recipes = RecipeCorpus(recipe_file)

    mm_path = output_file+'.corpus.mm'
    corpora.MmCorpus.serialize(mm_path, recipes)

    model = LdaModel(recipes,
                     id2word=recipes.dictionary,
                     num_topics=int(num_topics),
                     distributed=False)
    model.save(output_file)
    return model

Esempio n. 18

0

Mostra file

 def generate_docs_lda(self,
                       dictionary_file_path,
                       tfidf_file_path,
                       lda_file_path,
                       num_topics=100):
     """
     Generate the LDA topic file for the document library.

     :param dictionary_file_path: path of the saved gensim dictionary to load
     :param tfidf_file_path: path of the tf-idf corpus in MmCorpus format
     :param lda_file_path: path the trained model is pickled to
     :param num_topics: number of topics to train (previously ignored in
         favour of a hard-coded 100, which is still the default)
     :return: None; failures are logged rather than raised
     """
     try:
         dictionary = corpora.Dictionary.load(dictionary_file_path)
         tfidf_corpus = corpora.MmCorpus(tfidf_file_path)
         lda = LdaModel(corpus=tfidf_corpus,
                        id2word=dictionary,
                        num_topics=num_topics,
                        update_every=0,
                        passes=20)
         with open(lda_file_path, 'wb') as f:
             pickle.dump(lda, f)
             logger.info('lda model file building finished')
     except Exception as e:
         # Deliberate best-effort: report the failure and carry on.
         logger.error('generate documents library lda file failed for %s' %
                      str(e))

Esempio n. 19

0

Mostra file

File: Text_Analysis.py Progetto: charliesusername/Beer-Advisor

def compute_coherence_values(dictionary, corpus, texts, limit=40, start=2, step=6):
    """
    Compute c_v coherence for LDA models over a range of topic counts.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound (exclusive) on the number of topics
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    trained_models = []
    scores = []
    for topic_count in range(start, limit, step):
        # Progress output: the topic count is printed around every stage.
        print(topic_count)
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=topic_count)
        print(topic_count)
        trained_models.append(lda)
        print(topic_count)
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        print(scorer)
        scores.append(scorer.get_coherence())
        print(topic_count)

    return trained_models, scores

Esempio n. 20

0

Mostra file

def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
    """Convert :class:`~gensim.models.wrappers.ldamallet.LdaMallet` to :class:`~gensim.models.ldamodel.LdaModel`.

    A new LdaModel is created from the mallet model's ``id2word``,
    ``num_topics`` and ``alpha``, and the mallet model's ``wordtopics`` are
    copied into its ``expElogbeta``.

    Parameters
    ----------
    mallet_model : :class:`~gensim.models.wrappers.ldamallet.LdaMallet`
        Trained Mallet model
    gamma_threshold : float, optional
        To be used for inference in the new LdaModel.
    iterations : int, optional
        Number of iterations to be used for inference in the new LdaModel.

    Returns
    -------
    :class:`~gensim.models.ldamodel.LdaModel`
        Gensim native LDA.

    """
    lda_settings = dict(
        id2word=mallet_model.id2word,
        num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha,
        iterations=iterations,
        gamma_threshold=gamma_threshold,
        # float64: don't lose precision when converting from MALLET
        dtype=numpy.float64,
    )
    model_gensim = LdaModel(**lda_settings)
    model_gensim.expElogbeta[:] = mallet_model.wordtopics
    return model_gensim

Esempio n. 21

0

Mostra file

File: test_coherencemodel.py Progetto: trideeprath/mr2vec_train

 def setUp(self):
     # Two hand-written topic sets standing in for the output of two
     # different LdaModels. `topics1` separates system-human interaction
     # from graphs clearly, so both coherence measures should score it
     # higher than `topics2`.
     self.topics1 = [['human', 'computer', 'system', 'interface'],
                     ['graph', 'minors', 'trees', 'eps']]
     self.topics2 = [['user', 'graph', 'minors', 'system'],
                     ['time', 'graph', 'survey', 'minors']]
     self.ldamodel = LdaModel(corpus=corpus,
                              id2word=dictionary,
                              num_topics=2,
                              passes=0,
                              iterations=0)

     # The Mallet model is only built when MALLET_HOME is set.
     mallet_home = os.environ.get('MALLET_HOME', None)
     if mallet_home:
         self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet')
         self.malletmodel = LdaMallet(mallet_path=self.mallet_path,
                                      corpus=corpus,
                                      id2word=dictionary,
                                      num_topics=2,
                                      iterations=0)
     else:
         self.mallet_path = None

     # The Vowpal Wabbit model is only built when VOWPAL_WABBIT_PATH is set.
     vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None)
     if vw_path:
         self.vw_path = vw_path
         self.vwmodel = LdaVowpalWabbit(self.vw_path,
                                        corpus=corpus,
                                        id2word=dictionary,
                                        num_topics=2,
                                        passes=0)
     else:
         logging.info("Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model")
         self.vw_path = None

Esempio n. 22

0

Mostra file

File: ldavowpalwabbit.py Progetto: vinayaktrivedi/bhagwad_geeta_chatbot

def vwmodel2ldamodel(vw_model, iterations=50):
    """Convert :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit` to
    :class:`~gensim.models.ldamodel.LdaModel`.

    The new model is created with the hyper-parameters read from `vw_model`
    and then receives the result of `vw_model._get_topics()` as its
    `expElogbeta`.

    Parameters
    ----------
    vw_model : :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`
        Trained Vowpal Wabbit model.
    iterations : int
        Number of iterations to be used for inference of the new :class:`~gensim.models.ldamodel.LdaModel`.

    Returns
    -------
    :class:`~gensim.models.ldamodel.LdaModel`.
        Gensim native LDA.

    """
    lda_settings = dict(
        num_topics=vw_model.num_topics,
        id2word=vw_model.id2word,
        chunksize=vw_model.chunksize,
        passes=vw_model.passes,
        alpha=vw_model.alpha,
        eta=vw_model.eta,
        decay=vw_model.decay,
        offset=vw_model.offset,
        iterations=iterations,
        gamma_threshold=vw_model.gamma_threshold,
        dtype=numpy.float32,
    )
    model_gensim = LdaModel(**lda_settings)
    model_gensim.expElogbeta[:] = vw_model._get_topics()
    return model_gensim

Esempio n. 23

0

Mostra file

File: lda_model.py Progetto: ryancallihan/project-callihan-tureski

    def create_model(self,
                     doc_matrix,
                     term_dictionary,
                     model_path,
                     save_model=True,
                     language='language_na'):
        """
        Train an LDA model on a set of documents and optionally save it.

        :param doc_matrix: corpus passed to LdaModel
        :param term_dictionary: mapping passed as id2word
        :param model_path: base directory; the model is saved under
            <model_path>/models/<language>/
        :param save_model: whether to save the trained model
        :param language: stored on self.language and used in the file name
        :return LDA model: the trained model, also kept on self.ldamodel
        """
        self.language = language
        started_at = time()
        self.ldamodel = LdaModel(doc_matrix,
                                 num_topics=self.num_categories,
                                 id2word=term_dictionary,
                                 passes=50)

        if save_model:
            file_name = '%s_%s_category_lda.model' % (
                language, str(self.num_categories))
            target = os.path.join(model_path, 'models', self.language,
                                  file_name)
            self.save_model(model_path=target)

        elapsed = time() - started_at
        logging.info('Training lasted: {:.2f}s'.format(elapsed))
        return self.ldamodel

Esempio n. 24

0

Mostra file

def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
                             step=3,
                             output_dir="/home/norpheo/Documents/thesis"):
    """Train LDA models over a range of topic counts and score them (u_mass).

    One model is trained for every value in ``range(start, limit, step)``.
    After each model the pair ``(topic counts so far, coherence values so
    far)`` is pickled to ``coherence_pair_umass.pickle`` inside
    ``output_dir``, overwriting the previous file, so partial results
    survive an interrupted run.

    ``output_dir`` defaults to the previously hard-coded location; pass
    another directory to write the checkpoint elsewhere.

    Returns ``(model_list, coherence_values)``.
    """
    checkpoint_path = os.path.join(output_dir, "coherence_pair_umass.pickle")
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        print(f"Train {num_topics}")
        model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         num_topics=num_topics)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence='u_mass')
        coherence_values.append(coherencemodel.get_coherence())

        # Topic counts evaluated so far, paired with their scores.
        x = range(start, num_topics + step, step)
        coherence_pairs = (x, coherence_values)
        with open(checkpoint_path, "wb") as handle:
            pickle.dump(coherence_pairs, handle)

    return model_list, coherence_values

Esempio n. 25

0

Mostra file

File: example.py Progetto: afcarl/sklearn_gensim_example

def fit_lda(X, vocab, num_topics=5, passes=20):
    """ Fit LDA from a scipy CSR matrix (X).

    The transpose of X is wrapped in a Sparse2Corpus, and `vocab` is turned
    into an index -> term mapping used as id2word.
    """
    print('fitting lda...')
    corpus = matutils.Sparse2Corpus(X.T)
    id2word = dict(enumerate(vocab))
    return LdaModel(corpus,
                    num_topics=num_topics,
                    passes=passes,
                    id2word=id2word)

Esempio n. 26

0

Mostra file

def fit_LdaModel(gensim_df,
                 id2word,
                 num_topics,
                 alpha,
                 passes=15,
                 iterations=10000,
                 update_every=1000,
                 chunksize=1000,
                 minimum_topic_probability=0.05,
                 forget_weight=0.5,
                 distributed=True):
    """Train and return an LdaModel with per-word topics enabled.

    The arguments are forwarded to ``LdaModel`` under the same names, except
    ``gensim_df`` (passed as ``corpus``), ``minimum_topic_probability``
    (passed as ``minimum_probability``) and ``forget_weight`` (passed as
    ``decay``).
    """
    lda_settings = {
        'corpus': gensim_df,
        'id2word': id2word,
        'num_topics': num_topics,
        'alpha': alpha,
        'passes': passes,  # epochs
        'iterations': iterations,
        # NOTE(review): the original labelled both of the next two settings
        # "batch size" -- confirm against the gensim documentation.
        'update_every': update_every,
        'chunksize': chunksize,
        'minimum_probability': minimum_topic_probability,
        'decay': forget_weight,
        'per_word_topics': True,
        'distributed': distributed,
    }
    return LdaModel(**lda_settings)

Esempio n. 27

0

Mostra file

def lda_extractor(corpus, dictionary, num_topics=1):
    """Train and return an LdaModel on ``corpus`` with ``dictionary`` as id2word."""
    return LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)

Esempio n. 28

0

Mostra file

File: online_features.py Progetto: sarweshkrishnan/twitter-fake

def topic_model(df_train, df_test, topic_count=10):
    """Clean the tweets, train LDA on the training set and add topic results.

    The 'tweet' column of both frames is overwritten in place with the
    cleaned text (general processing, then stop-word removal). The
    dictionary and the LDA model are built from the training tweets only.
    Both frames are then passed through ``fill_lda_result``.

    Returns ``(df_train, df_test)``.
    """
    from gensim.corpora.dictionary import Dictionary
    from gensim.models.ldamodel import LdaModel

    # Apply each cleaning step to train first, then test.
    for clean in (general_text_processing, remove_stop_words):
        for frame in (df_train, df_test):
            frame['tweet'] = frame['tweet'].map(clean)

    # Build the vocabulary one tweet at a time from the training set.
    dictionary = Dictionary()
    for tweet in df_train.tweet.values.tolist():
        dictionary.add_documents([tweet.split()])

    train_bow = [
        dictionary.doc2bow(tweet.split())
        for tweet in df_train['tweet'].values.tolist()
    ]
    lda_model = LdaModel(train_bow, num_topics=topic_count)

    # Fill in the topic results (test frame first, then train).
    df_test = fill_lda_result(df_test, lda_model, dictionary, topic_count)
    df_train = fill_lda_result(df_train, lda_model, dictionary, topic_count)

    return df_train, df_test

Esempio n. 29

0

Mostra file

def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
    """
    Convert a trained mallet model into a gensim LdaModel.

    A new LdaModel is built from the mallet model's id2word, num_topics and
    alpha; the mallet model's wordtopics are then copied into its
    expElogbeta.

    Args:
        mallet_model : Trained mallet model
        gamma_threshold : To be used for inference in the new LdaModel.
        iterations : number of iterations to be used for inference in the new LdaModel.

    Returns:
        model_gensim : LdaModel instance; copied gensim LdaModel
    """
    # float64: don't lose precision when converting from MALLET
    precision = numpy.float64
    model_gensim = LdaModel(id2word=mallet_model.id2word,
                            num_topics=mallet_model.num_topics,
                            alpha=mallet_model.alpha,
                            iterations=iterations,
                            gamma_threshold=gamma_threshold,
                            dtype=precision)
    word_topics = mallet_model.wordtopics
    model_gensim.expElogbeta[:] = word_topics
    return model_gensim

Esempio n. 30

0

Mostra file

def vwmodel2ldamodel(vw_model, iterations=50):
    """
    Convert a trained vowpal wabbit model into a gensim LdaModel.

    The new model is created with the hyper-parameters read from vw_model
    and then receives the result of vw_model._get_topics() as its
    expElogbeta.

    Args:
        vw_model : Trained vowpal wabbit model.
        iterations : Number of iterations to be used for inference of the new LdaModel.

    Returns:
        model_gensim : LdaModel instance; copied gensim LdaModel.
    """
    # Settings taken over unchanged from the vowpal wabbit model.
    copied_names = ('num_topics', 'id2word', 'chunksize', 'passes', 'alpha',
                    'eta', 'decay', 'offset', 'gamma_threshold')
    settings = {name: getattr(vw_model, name) for name in copied_names}

    model_gensim = LdaModel(iterations=iterations,
                            dtype=numpy.float32,
                            **settings)
    model_gensim.expElogbeta[:] = vw_model._get_topics()
    return model_gensim