コード例 #1
0
def visualize():
    """Build the pyLDAvis view of the module-level LDA model and return it.

    Reads the globals ``lda_model``, ``corpus`` and ``dictionary_LDA``.

    Returns
    -------
    The object produced by ``pyLDAvis.display``. Returning it lets a notebook
    render the panel when this call is the last expression of a cell.
    """
    # Local imports: pyLDAvis is only needed when this helper is called.
    import pyLDAvis
    import pyLDAvis.gensim

    vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary_LDA)
    pyLDAvis.enable_notebook()
    # The display object used to be dropped, so nothing could be rendered
    # from inside the function; hand it back to the caller instead.
    return pyLDAvis.display(vis)
コード例 #2
0
    def view_clusters(self):
        """
        Fit an LDA model on ``self.texts`` and show its topics with pyLDAvis.

        ``self.id2word`` and ``self.corpus`` are rebuilt from ``self.texts``
        on every call. When ``self.number_of_topics`` is None a hint is
        printed and None is returned; otherwise the prepared pyLDAvis data
        is returned.
        """
        if self.number_of_topics is None:
            print('Error: Number of topics not set.')
            print('Set number of topics with [object].set_number_of_topics(X)')
            return

        self.id2word = hf.create_id2word(self.texts)
        self.corpus = hf.create_corpus(self.id2word, self.texts)

        # Train the topic model.
        lda_settings = dict(
            corpus=self.corpus,
            id2word=self.id2word,
            num_topics=self.number_of_topics,
            update_every=1,
            chunksize=100,
            passes=10,
            alpha='auto',
            per_word_topics=True,
        )
        topic_model = gensim.models.ldamodel.LdaModel(**lda_settings)

        # Show the topics in the notebook and hand the prepared data back.
        pyLDAvis.enable_notebook()
        prepared = pyLDAvis.gensim.prepare(topic_model, self.corpus, self.id2word)
        pyLDAvis.display(prepared)
        return prepared
コード例 #3
0
    def visualize(self, mds='pcoa'):
        """
        Render the fitted LDA model with pyLDAvis.

        see: https://nbviewer.jupyter.org/github/bmabey/pyLDAvis/blob/master/notebooks/pyLDAvis_overview.ipynb#topic=8&lambda=1&term=
        paper: https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf

        Parameters
        ----------
        mds: str
            scaling function forwarded to ``pyLDAvis.gensim.prepare``;
            valid options are ['pcoa', 'tsne', 'mmds']

        Returns
        -------
        The prepared data object built from ``self.model``, ``self.corpus``
        and ``self.dictionary``.
        """
        import pyLDAvis
        import pyLDAvis.gensim

        reminder = ("Make sure you have pyLDAviz imported in the notebook:\n\n"
                    "import pyLDAvis\n"
                    "pyLDAvis.enable_notebook()\n")
        print(reminder)

        prepared = pyLDAvis.gensim.prepare(
            self.model, self.corpus, self.dictionary, mds=mds)
        pyLDAvis.display(prepared)

        return prepared
コード例 #4
0
def pylda_visualize(csv_chemin, ecriture_chemin, tfidf_visualization = False, num_topic=3, filter_by_cluster=None):
    '''Fit an LDA model on clustered texts and write its pyLDAvis page to disk.

    Reads the clustering result from csv_chemin and writes, next to each
    other, ``ecriture_chemin`` + '.mm' (bag-of-words corpus), '.dict'
    (dictionary), '.model' (LDA model) and '.html' (LDA visualisation).

    csv_chemin points to a dataframe with two columns: 'pred_cluster' (the
        cluster) and 'text' (the text)
    ecriture_chemin is the path prefix of the files that are written
    tfidf_visualization: when true, each text is represented by
        (word_id, tfidf_weight) pairs instead of (word_id, count) pairs
        for training and visualisation
    num_topic is the number of topics we want to extract from the texts
    filter_by_cluster is the cluster index, if we want to extract topics
        from one cluster only (None keeps every cluster)
    '''
    clustering_result_df = pd.read_csv(csv_chemin)
    # `is not None` keeps cluster index 0 usable as a filter, and the filtered
    # frame has to be assigned back, otherwise the selection has no effect.
    if filter_by_cluster is not None:
        clustering_result_df = clustering_result_df[clustering_result_df['pred_cluster'] == filter_by_cluster]

    docs = pd.DataFrame(list(map(load_doc, enumerate(list(clustering_result_df['text'].apply(clean))))))

    # dictionary : keys = word_id ; value = word
    # corpus[i] = list of tuples (word_id, count) where count is the number of
    # occurrences of the word in text i
    dictionary, corpus = prep_corpus(docs['tokens'])

    if tfidf_visualization:
        # Transform every document once; the model already yields
        # (word_id, tfidf_weight) pairs, so no manual re-assembly is needed.
        tfidf_model = TfidfModel(corpus)
        model_corpus = [tfidf_model[document] for document in corpus]
    else:
        model_corpus = corpus

    # The serialized corpus is always the raw bag-of-words one.
    MmCorpus.serialize(ecriture_chemin + '.mm', corpus)
    dictionary.save(ecriture_chemin + '.dict')

    lda = models.ldamodel.LdaModel(corpus=model_corpus, id2word=dictionary, num_topics=num_topic, passes=10)
    lda.save(ecriture_chemin + '.model')

    vis_data = gensimvis.prepare(lda, model_corpus, dictionary)
    pyLDAvis.display(vis_data)
    pyLDAvis.save_html(vis_data, ecriture_chemin + '.html')
コード例 #5
0
 def display_data(self):
     """
     Load the saved LDA model, corpus and dictionary, dump the prepared
     pyLDAvis data to ``self.LDAvis_data_filepath`` and return the display
     object.

     :return: the object returned by ``pyLDAvis.display``
     """
     lda = LdaMulticore.load(self.lda_model_filepath)
     trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
     trigram_dictionary = Dictionary.load_from_text(self.trigram_dictionary_filepath)
     LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                               trigram_dictionary)
     # The on-disk format (plain ``str`` of the prepared data) is kept as is.
     with open(self.LDAvis_data_filepath, 'w') as f:
         f.write(str(LDAvis_prepared))
     # Display the prepared data itself; the previous code re-opened the file
     # and passed the (already closed) file handle to pyLDAvis.
     return pyLDAvis.display(LDAvis_prepared)
def ldavis_create(lda,
                  corpus,
                  gensim_dict,
                  LDAvis_data_filepath=fpathroot + fpathappend + '_lda_vis',
                  return_ldavis=False):
    """Prepare pyLDAvis data, pickle it to disk and either return or display it.

    Parameters
    ----------
    lda, corpus, gensim_dict
        Forwarded unchanged to ``pyLDAvis.prepare``.
    LDAvis_data_filepath : str
        Destination of the pickled prepared data. The default is computed
        once, when the function is defined, from ``fpathroot`` and
        ``fpathappend``.
    return_ldavis : bool
        When true the prepared data is returned; otherwise it is passed to
        ``pyLDAvis.display`` and None is returned.
    """
    # NOTE(review): pyLDAvis.prepare is documented as taking topic/term
    # distributions; a gensim model/corpus/dictionary triple is normally
    # handled by the gensim adapter — confirm which call was intended.
    LDAvis_prepared = pyLDAvis.prepare(lda, corpus, gensim_dict)
    # pickle produces bytes, so the file has to be opened in binary mode.
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
    if return_ldavis:
        return LDAvis_prepared
    pyLDAvis.display(LDAvis_prepared)
コード例 #7
0
def visualise(model_file, corpus_file, dictionary_file):
    """Load a saved corpus, dictionary and LdaMulticore model and display them with pyLDAvis.

    Progress is reported on stdout; nothing is returned.
    """
    # use Notebook version if not working

    print('Loading corpus from ' + corpus_file)
    bow_corpus = MmCorpus(corpus_file)

    print('Loading dictionary from ' + dictionary_file)
    id2word = Dictionary.load(dictionary_file)

    print('Loading model from ' + model_file)
    lda = models.ldamulticore.LdaMulticore.load(model_file)

    prepared = gensimvis.prepare(lda, bow_corpus, id2word)
    pyLDAvis.display(prepared)
    print('Please use Jupyter notebook visualise.ipynb if not working')
コード例 #8
0
def topicmodel_forproyect(id_proyect):
    """Fit a 5-topic LDA model on one project's comments and return the pyLDAvis HTML.

    Comments come from ``get_data(id_proyect)`` (column ``body``). They are
    lower-cased, stripped of ``@mentions``, tokenized on ``\\w+`` and filtered
    against the Spanish stop-word list. The project id and every processed
    comment are printed.

    Returns
    -------
    str
        The HTML produced by ``pyLDAvis.prepared_data_to_html``, or an error
        message when the model raises ``ValueError`` during training.
    """
    df_comments = get_data(id_proyect)
    num_topics = 5

    df2 = df_comments.body
    df2 = df2.str.lower()
    pattern = r"@([A-Za-z0-9_]+)"
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would leave every @mention in place.
    df2 = df2.str.replace(pattern, '', regex=True)

    elements = np.array(df2.tolist())
    tokenizer = RegexpTokenizer(r'\w+')
    # A set gives constant-time stop-word lookups in the loop below.
    es_stop = set(get_stop_words('es'))
    texts = []
    print(str(id_proyect))
    for comment in elements:
        raw = comment.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [token for token in tokens if token not in es_stop]
        texts.append(stopped_tokens)
        print(comment)

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    try:
        ldamodel = gensim.models.ldamulticore.LdaMulticore(
            corpus, num_topics=num_topics, id2word=dictionary, passes=20)
    except ValueError:
        return "Coleccion Vacia. Aparentemente parametros faltantes o mal ingresados."

    import pyLDAvis.gensim
    import pyLDAvis

    vis_data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    pyLDAvis.display(vis_data)

    return pyLDAvis.prepared_data_to_html(vis_data)
コード例 #9
0
ファイル: lab4.py プロジェクト: UppsalaIM/2IS060
def visualize_lda_model():
    """Train a 20-topic LDA model on the lemmatized data and return its pyLDAvis display.

    The data comes from ``preprocess_to_lemmatization()``. Sentences are
    flattened into one token list per document, filtered (alphabetic, longer
    than one character, not a stop word), merged into bigrams/trigrams with
    ``Phrases`` and turned into a bag-of-words corpus.

    Returns
    -------
    The object returned by ``pyLDAvis.display``.
    """
    data = preprocess_to_lemmatization()
    stopwords_verbs = [
        'say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see',
        'want', 'come', 'take', 'use', 'would', 'can'
    ]
    stopwords_other = [
        'one', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption', 'also',
        'copyright', 'something'
    ]
    # A frozenset gives constant-time membership tests in the per-token
    # filter below (the list version scanned every stop word per token).
    my_stopwords = frozenset(
        stopwords.words('english') + stopwords_verbs + stopwords_other)
    data['tokens'] = data['tokens_sentences_lemmatized'].map(
        lambda sentences: list(chain.from_iterable(sentences)))
    data['tokens'] = data['tokens'].map(lambda tokens: [
        token.lower() for token in tokens
        if token.isalpha() and token.lower() not in my_stopwords
        and len(token) > 1
    ])
    tokens = data['tokens'].tolist()
    bigram_model = Phrases(tokens)
    trigram_model = Phrases(bigram_model[tokens], min_count=1)
    tokens = list(trigram_model[bigram_model[tokens]])

    dictionary_LDA = corpora.Dictionary(tokens)
    dictionary_LDA.filter_extremes(no_below=3)
    corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]
    np.random.seed(123456)
    num_topics = 20
    lda_model = models.LdaModel(corpus,
                                num_topics=num_topics,
                                id2word=dictionary_LDA,
                                passes=4,
                                alpha=[0.01] * num_topics,
                                eta=[0.01] * len(dictionary_LDA.keys()))
    lda_viz = gensimvis.prepare(lda_model, corpus, dictionary_LDA)
    pyLDAvis.enable_notebook()
    return pyLDAvis.display(lda_viz)
コード例 #10
0
def visual_lda():
    """Rebuild the review corpus and return the pyLDAvis display of the saved LDA model.

    Loads the model from ``../model/lda.model`` and the tokenized reviews
    (JSON) from ``../result/ad_issue_reviews``, removes English stop words
    and tokens that occur only once, then prepares the visualisation.

    Returns
    -------
    The object returned by ``pyLDAvis.display``.
    """
    from collections import Counter
    import pyLDAvis.gensim as gensimvis
    import pyLDAvis

    lda = LdaMulticore.load("../model/lda.model")
    with open("../result/ad_issue_reviews") as fin:
        reviews = json.load(fin)

    # build bag-of-words, corpus
    # Load the stop-word list once; it used to be re-read for every word.
    english_stopwords = frozenset(stopwords.words('english'))
    reviews = [[word for word in review if word not in english_stopwords] for review in reviews]

    freq = Counter(token for review in reviews for token in review)
    reviews = [[token for token in review if freq[token] > 1] for review in reviews]

    dictionary = corpora.Dictionary(reviews)
    corpus = [dictionary.doc2bow(review) for review in reviews]
    vis_data = gensimvis.prepare(lda, corpus, dictionary)
    # Returned so that a notebook can render it; it was previously discarded.
    return pyLDAvis.display(vis_data)
コード例 #11
0
def topicmodel_allcoments():
    """Fit a 5-topic LDA model on all comments and return the pyLDAvis data as JSON.

    Comments come from ``get_data()`` (column ``body``). URLs are removed
    from ``df_comments['body']`` in place; the text is then lower-cased,
    stripped of ``@mentions``, tokenized on ``\\w+`` and filtered against the
    Spanish stop-word list.

    Returns
    -------
    str
        JSON serialisation of the prepared pyLDAvis data.
    """
    df_comments = get_data()

    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would leave URLs and @mentions untouched.
    pattern = r"http\S+"
    df_comments['body'] = df_comments['body'].str.replace(pattern, '', regex=True)

    df2 = df_comments.body
    df2 = df2.str.lower()
    pattern = r"@([A-Za-z0-9_]+)"
    df2 = df2.str.replace(pattern, '', regex=True)

    elements = np.array(df2.tolist())
    tokenizer = RegexpTokenizer(r'\w+')
    # A set gives constant-time stop-word lookups in the loop below.
    es_stop = set(get_stop_words('es'))
    texts = []
    for comment in elements:
        raw = comment.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [token for token in tokens if token not in es_stop]
        texts.append(stopped_tokens)

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                               num_topics=5,
                                               id2word=dictionary,
                                               passes=20)
    import pyLDAvis.gensim
    import pyLDAvis

    vis_data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    pyLDAvis.display(vis_data)

    # PreparedData knows how to serialise itself; `pyLDAvis.json` is not part
    # of the public API and plain json.dumps cannot encode the prepared data.
    return vis_data.to_json()
コード例 #12
0
 def display(self):
     """
     Build the pyLDAvis view of the TF-IDF LDA model (topics with their salient words).
     :return: the ``data`` attribute of the object returned by ``pyLDAvis.display``
     """
     prepared = pyLDAvis.gensim.prepare(
         topic_model=self.lda_model_tfidf,
         corpus=self.tf_idf_corpus,
         dictionary=self.dictionary,
     )
     from IPython.core.display import HTML
     rendered: HTML = pyLDAvis.display(prepared)
     return rendered.data
コード例 #13
0
ファイル: document.py プロジェクト: abeautifulman/DoubleCheck
	def vectorize(self):
		'''
		Generate an LDA topic model of the document using gensim and pyLDAvis.

		args:
		    none
		output:
		    displays and serves the visualisation, then writes it to
		    topic_models/<name>.json and topic_models/<name>.html
		'''
		# tokenize and remove stopwords
		sentences = self.sent_detector.tokenize(self.raw.decode('utf-8').strip()) # use raw text
		texts = [[word for word in sentence.lower().split() if word not in self.stopwords] for sentence in sentences]

		# compute the frequency of each token
		frequency = defaultdict(int)
		for text in texts:
			for token in text:
				frequency[token] += 1

		# remove words that appear only once
		texts = [[token for token in text if frequency[token] > 1] for text in texts]

		# construct a gensim dictionary and corpus (bag of words)
		dictionary = corpora.Dictionary(texts)
		corpus = [dictionary.doc2bow(text) for text in texts] # currently, "text" is a sentence in the document

		# define LDA model
		lda = models.ldamodel.LdaModel(corpus=corpus,
					       id2word=dictionary,
					       num_topics=10, # what should this be ???
					       update_every=1,
					       chunksize=10000,
					       passes=1)

		# visualize the lda space
		vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
		pyLDAvis.display(vis_data)
		# NOTE(review): pyLDAvis.show starts a local server; the files below
		# are presumably only written once it has been stopped - confirm.
		pyLDAvis.show(vis_data)
		# 'w' instead of 'a+': appending a second document to an existing
		# file would leave invalid JSON/HTML behind on every re-run.
		with open('topic_models/'+self.name+'.json', 'w') as topic_json:
			pyLDAvis.save_json(vis_data, topic_json)
		with open('topic_models/'+self.name+'.html', 'w') as topic_html:
			pyLDAvis.save_html(vis_data, topic_html)
コード例 #14
0
def show_topics(corpus):
    """
    Topics visualization

    Fits a 4-topic ``LdaMulticore`` model (10 passes, 2 workers) on the
    corpus and prepares it for pyLDAvis.

    Parameters
    ----------
    corpus : list
        corpus of documents
        NOTE(review): presumably each document is a list of string tokens,
        as it is passed straight to ``gensim.corpora.Dictionary`` — confirm.

    Returns
    -------
    The object returned by ``pyLDAvis.display``.
    """
    dic = gensim.corpora.Dictionary(corpus)
    bow_corpus = [dic.doc2bow(doc) for doc in corpus]
    lda_model = gensim.models.LdaMulticore(bow_corpus,
                                           num_topics=4,
                                           id2word=dic,
                                           passes=10,
                                           workers=2)

    lda_vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dic)
    pyLDAvis.enable_notebook()
    # Returned so that a notebook can render it; it was previously discarded.
    return pyLDAvis.display(lda_vis)
コード例 #15
0
ファイル: utils.py プロジェクト: klausrossmann/lda2vec-tf
def generate_ldavis_data(data_path, model, idx_to_word, freqs, vocab_size):
    """Prepare the model's topics for pyLDAvis and return the display object.

    The document, topic and word embeddings are read from the model's
    session, combined by ``prepare_topics`` and passed to
    ``pyLDAvis.prepare``.

    Parameters
    ----------
    data_path : str
        Location where your data is stored; ``doc_lengths.npy`` is read
        from it.
    model : Lda2Vec
        Loaded lda2vec tensorflow model.
    idx_to_word : dict
        index to word mapping dictionary
    freqs : list
        Frequencies of each token.
    vocab_size : int
        Total size of your vocabulary (not used by this function).

    Returns
    -------
    The object returned by ``pyLDAvis.display``.
    """

    doc_embed = model.sesh.run(model.mixture.doc_embedding)
    topic_embed = model.sesh.run(model.mixture.topic_embedding)
    word_embed = model.sesh.run(model.w_embed.embedding)

    # Words in the iteration order of idx_to_word.
    # NOTE(review): this presumably has to match index order 0..vocab_size;
    # confirm the mapping is built in that order.
    vocabulary = list(idx_to_word.values())

    # Read in document lengths
    doc_lengths = np.load(data_path + "/doc_lengths.npy")

    # The prepare_topics function is a direct copy from Chris Moody
    vis_data = prepare_topics(doc_embed,
                              topic_embed,
                              word_embed,
                              np.array(vocabulary),
                              doc_lengths=doc_lengths,
                              term_frequency=freqs,
                              normalize=True)

    prepared_vis_data = pyLDAvis.prepare(**vis_data)
    # Returned so that a notebook can render it; it was previously discarded.
    return pyLDAvis.display(prepared_vis_data)
コード例 #16
0
def showPyLDAvisNB(allDict, numTopics=30):
    """Prepare pyLDAvis data from ``allDict`` and display it in the notebook.

    The first three items returned by ``preparePyLDAvisData`` are forwarded,
    in order, to ``pyLDAvis.gensim.prepare``. Nothing is returned.
    """
    # TODO: see if we can get ngrams into pyLDAvis

    lda_args = preparePyLDAvisData(allDict, limit=None, numTopics=numTopics)
    prepared = pyLDAvis.gensim.prepare(lda_args[0], lda_args[1], lda_args[2])

    output_notebook()
    pyLDAvis.enable_notebook(True)
    panel = pyLDAvis.display(prepared, template_type='general')
    plt.tight_layout()

    display(panel)
    return
コード例 #17
0
def textTopicmodel(n_topics=2):
    """Fit a scikit-learn LDA model on the segmented words and show it with pyLDAvis.

    The items returned by ``segWord()`` are converted to strings; those
    shorter than two characters are dropped and the rest are used as the
    documents of a term-count matrix (at most 1500 features). The
    document-topic matrix and the topic components are printed.

    Parameters
    ----------
    n_topics : int
        Number of LDA components.
    """
    segment = segWord()
    # Each kept item is already a string, so it is used as a document as is.
    corpus = [str(w) for w in segment if len(str(w)) >= 2]
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=1,
                                    max_features=1500,
                                    stop_words=None)
    tf = tf_vectorizer.fit_transform(corpus)
    # The unused `get_feature_names()` lookup was dropped: the method no
    # longer exists in recent scikit-learn releases and its result was
    # never read.
    lda = LatentDirichletAllocation(n_components=n_topics,
                                    learning_offset=50,
                                    random_state=0)
    docres = lda.fit_transform(tf)
    print('============================')
    print(docres)
    print('==========================')
    print(lda.components_)
    visualisation = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
    pyLDAvis.display(visualisation)
    pyLDAvis.show(visualisation)
コード例 #18
0
ファイル: NlPipe.py プロジェクト: Veritogen/master_thesis
 def evaluate_pyldavis(self, model=None, use_jupyter=None):
     """
     Method for a visual evaluation of the LDA topic model using pyldavis.
     :param model: LDA model that is to be evaluated. If 'None', it will use the last model that has been saved
     within the class. An ``LdaMallet`` model is converted with ``malletmodel2ldamodel`` first.
     :param use_jupyter: set how the pyldavis panel is displayed. If default (None), it will try to find out if run
     from jupyter and set the method accordingly
     :return: in notebook mode the object returned by ``pyLDAvis.display`` (so the notebook can render it),
     otherwise None after ``pyLDAvis.show`` has been called
     :raises Exception: if no model is passed and ``self.lda_model`` is None
     """
     if model is None:
         if self.lda_model is None:
             raise Exception(
                 "Please create a LDA model for evaluation before running this method."
             )
         model = self.lda_model
     if isinstance(model, LdaMallet):
         model = malletmodel2ldamodel(model)
     panel = pyLDAvis.gensim.prepare(model, self.bag_of_words, self.id2word)
     if use_jupyter is None:
         # NOTE(review): presumably the shell stores the launching executable
         # in `_`; a missing variable simply means "not in a notebook".
         launcher = os.environ.get('_', '')
         use_jupyter = launcher.split("/")[-1] == "jupyter-notebook"
     if use_jupyter:
         pyLDAvis.enable_notebook()
         # Returned instead of discarded so that the panel can be rendered.
         return pyLDAvis.display(panel)
     pyLDAvis.show(panel)
コード例 #19
0
def visuzalization(ldamodel, corpus, dictionary, num_words):
    """Plot one word cloud per topic and return the pyLDAvis display of the model.

    Parameters
    ----------
    ldamodel, corpus, dictionary
        Forwarded to ``pyLDAvis.gensim.prepare``.
    num_words : int
        Number of words requested from ``ldamodel.show_topic`` per cloud.

    Returns
    -------
    The object returned by ``pyLDAvis.display``.
    """
    viz = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    legend = topic_items(ldamodel, 15)

    for topic_id, _ in legend.items():
        # fit_words expects a word -> weight mapping; show_topic is used here
        # as a sequence of (word, weight) pairs.
        word_weights = dict(ldamodel.show_topic(topic_id, num_words))
        plt.figure()
        plt.imshow(WordCloud(background_color="white").fit_words(word_weights))
        plt.axis("off")
        plt.title("Topic #" + str(topic_id + 1))
        plt.show()

    return pyLDAvis.display(viz)
コード例 #20
0
def showPyLDAvis(allDict, notebook=True, numTopics=30):
    """Prepare pyLDAvis data from ``allDict`` and show it inline or via ``pyLDAvis.show``.

    The first three items returned by ``preparePyLDAvisData`` are forwarded,
    in order, to ``pyLDAvis.gensim.prepare``. With ``notebook == True`` the
    panel is displayed inline; otherwise ``output_file("pyDAVis.html")`` is
    set and ``pyLDAvis.show`` is used. Nothing is returned.
    """
    # TODO: see if we can get ngrams into pyLDAvis

    lda_args = preparePyLDAvisData(allDict, limit=None, numTopics=numTopics)
    prepared = pyLDAvis.gensim.prepare(lda_args[0], lda_args[1], lda_args[2])

    if notebook == True:
        output_notebook()
        pyLDAvis.enable_notebook(True)
        display(pyLDAvis.display(prepared, template_type='general'))
        return

    output_file("pyDAVis.html")
    # displays in own window combined with output_file
    show(pyLDAvis.show(prepared))
    return
コード例 #21
0
 def py_lda_vis(column,
                lib,
                lda_models,
                dtm=None,
                vectorizer=None,
                corpus=None,
                dictionary=None):
     """Display the pyLDAvis panel of the model stored under ``column``.

     ``lib == 'sklearn'`` uses ``dtm[column]`` and ``vectorizer[column]``
     with the sklearn adapter; any other value uses ``corpus[column]`` and
     ``dictionary[column]`` with the gensim adapter. Topics keep their
     original order (``sort_topics=False``).
     """
     selected_model = lda_models[column]
     if lib == 'sklearn':
         doc_term_matrix = np.asmatrix(dtm[column])
         vis_data = pyLDAvis.sklearn.prepare(
             selected_model, doc_term_matrix, vectorizer[column],
             sort_topics=False)
     else:
         vis_data = pyLDAvis.gensim.prepare(
             selected_model, corpus[column], dictionary[column],
             sort_topics=False)
     panel = pyLDAvis.display(vis_data)
     display(panel)
コード例 #22
0
 def fit(
     self,
     num_topics,
     alpha="symmetric",
     beta=None,
     passes=2,
     random_state=9,
     tuning=False,
     predict_training_samples=False,
 ):
     """Train the LDA model on ``self.bow_corpus`` and record its scores.

     Sets ``self.model``, ``self.perplexity_score_``, ``self.coherence_score_``
     and ``self.coherence_score_per_topic_``. ``beta`` is passed to gensim as
     ``eta``.

     With ``tuning`` true, ``self.visualize_topics_`` becomes a hint string
     and the coherence score is returned immediately. Otherwise the pyLDAvis
     display object is stored in ``self.visualize_topics_`` and, if
     ``predict_training_samples`` is true, the predictions for ``self.texts``
     are stored as well; None is returned.
     """
     self.model = models.ldamodel.LdaModel(
         self.bow_corpus,
         num_topics=num_topics,
         alpha=alpha,
         eta=beta,
         id2word=self.dictionary,
         passes=passes,
         random_state=random_state,
     )
     # Score the fitted model on the training corpus.
     self.perplexity_score_ = self.model.log_perplexity(self.bow_corpus)
     coherence = self.score(self.docs, return_per_topic=True)
     self.coherence_score_, self.coherence_score_per_topic_ = coherence

     if tuning:
         self.visualize_topics_ = 'Set tuning parameter in fit function to "False" to visualize LDA result!'
         return self.coherence_score_

     pyLDAvis.enable_notebook()
     prepared = pyLDAvis.gensim.prepare(self.model, self.bow_corpus,
                                        self.dictionary)
     self.visualize_topics_ = pyLDAvis.display(prepared)

     if predict_training_samples:
         predictions = self.predict(self.texts, True)
         (
             self.training_samples_predict_proba_,
             self.training_samples_prediction_,
         ) = predictions
コード例 #23
0
    def visualize_lda_to_html(
            self,
            target_topic_num,
            top_n=10,
            r_normalized=False,
            relevence_lambda_val=.6,
            workers_n=2,
            random_seed=1,
            savepath='./',
            filename_affix='lda',
            # save_type='html',  # {'html', 'json'}
            save_relevent_terms_ok=True,
            save_html_ok=True,
            display_ok=False,
            ):
        """
        Prepare the `pyLDAvis` data of one stored LDA model and optionally save it.

        The model is looked up in `self.lda_model_dict`, its term tables
        (adjusted scores based on saliency & relevance) are computed by
        `self._get_terminfo_table` and stored on the instance as
        `vis_prepared`, `total_terms_df`, `top_relevant_terms_df`,
        `r_adj_score_df` and `bow_score_list`.

        Parameters
        ----------

        target_topic_num: int
            The topic number (key of `self.lda_model_dict`) of the LDA model
            to visualize.

        top_n: int (default: `10`)
            The number of most relevant terms per topic.

        r_normalized: bool (default: `False`)
            Use normalized probabilities when it is `True`. (not recommended in most cases.)

        relevence_lambda_val: float (default: `.6`).
            The lambda value (ratio) used to calculate relevance.

        workers_n: int (default: `2`)
            The number of CPU cores used for the calculation (`pyLDAvis.prepare`).

        random_seed: int (default: `1`)
            A random seed number.

        savepath: str (default: `'./'`)
            The directory in which the `pyLDAvis` page and the
            `pandas.DataFrame` are saved; it is created when missing.

        filename_affix: str (default: `'lda'`)
            The affix of the saved filenames.

        save_relevent_terms_ok: bool (default: `True`)
            Save the `top_relevant_terms_df` `pandas.DataFrame` as csv.

        save_html_ok: bool (default: `True`)
            Save the `pyLDAvis` html page.

        display_ok: bool (default: `False`)
            Call `pyLDAvis.display` when it is `True`.

        Raises
        ------

        KeyError
            When `target_topic_num` is not a key of `self.lda_model_dict`.

        References
        ----------

        Saliency: 
            `Chuang, J., 2012. Termite: Visualization techniques for assessing textual topic models`

        Relevance:
            `Sievert, C., 2014. LDAvis: A method for visualizing and interpreting topics`

        Example
        -------

        >>> import unipy_nlp.analyze.topic_modeling as utpm
        >>> tpm = utpm.TopicModeler(sentence_list, tokenized)
        >>> tpm.pick_best_lda_topics(
        ...     num_topic=5,
        ...     workers_n=8,
        ...     random_seed=1,
        ... )
        >>> tpm.visualize_lda_to_html(
        ...     7,
        ...     top_n=10,
        ...     r_normalized=False,
        ...     relevence_lambda_val=.6,
        ...     workers_n=8,
        ...     random_seed=1,
        ...     savepath='data/_tmp_dump/topic_modeling',
        ...     filename_affix='lda',
        ...     save_relevent_terms_ok=True,
        ...     save_html_ok=True,
        ...     display_ok=False,
        ... )

        """
        # Guard clause: refuse topic numbers no model was trained for.
        if target_topic_num not in self.lda_model_dict:
            raise KeyError("Model doesn't exist. Select a proper number.")

        self.selected_topic_num = target_topic_num
        self.selected_model = self.lda_model_dict[target_topic_num]['model']

        # Store the five result tables directly on the instance.
        (self.vis_prepared,
         self.total_terms_df,
         self.top_relevant_terms_df,
         self.r_adj_score_df,
         self.bow_score_list) = self._get_terminfo_table(
            self.selected_model,
            corpus=self.bow_corpus_doc,
            dictionary=self.corpora_dict,
            doc_topic_dists=None,
            use_gensim_prepared=True,
            top_n=top_n,
            r_normalized=r_normalized,
            relevence_lambda_val=relevence_lambda_val,
            workers_n=workers_n,
            random_seed=random_seed,
        )

        if save_html_ok:
            os.makedirs(savepath, exist_ok=True)
            html_filename = f'{filename_affix}_topics-{target_topic_num}.html'
            ldavis_filename_html_str = os.path.join(savepath, html_filename)
            pyLDAvis.save_html(self.vis_prepared, ldavis_filename_html_str)
            print(f"LDAVIS HTML Saved: '{ldavis_filename_html_str}'")

        if save_relevent_terms_ok:
            os.makedirs(savepath, exist_ok=True)
            csv_filename = '_'.join([
                f'{filename_affix}',
                f'topics-{target_topic_num}',
                f'top{top_n}_relevent_terms_df.csv',
            ])
            ldavis_filename_rdf_str = os.path.join(savepath, csv_filename)
            self.top_relevant_terms_df.to_csv(
                ldavis_filename_rdf_str,
                index=True,
                header=True,
                encoding='utf-8',
            )
            print(f"LDAVIS DF Saved: '{ldavis_filename_rdf_str}'")

        if display_ok:
            pyLDAvis.display(self.vis_prepared, local=False)
コード例 #24
0
def pyLDAvisData(lda, num_topics, len_vocab, corpus, text, dictionary_tokens):
    """Collect the five keyword arguments that are later unpacked into ``pyLDAvis.prepare``.

    Each value is computed by a helper defined elsewhere; the helpers are
    called in the order of the keys below.
    """
    data = {}
    data['topic_term_dists'] = topic_term_dists(lda, num_topics, len_vocab)
    data['doc_topic_dists'] = doc_topic_dists(corpus, lda)
    data['doc_lengths'] = doc_lengths(text)
    data['vocab'] = get_vocabularyAlpha(dictionary_tokens)
    data['term_frequency'] = get_term_frequency(corpus)
    return data
  
# 1 - PyLDAvis
import pyLDAvis

# Build the pyLDAvis inputs for the 5-topic model; both `data` and
# `topics_model_data` stay bound to the same dictionary.
vocabulary_ids = dictionary.token2id
data = pyLDAvisData(lda, 5, len(vocabulary_ids), corpus, texts, vocabulary_ids)
topics_model_data = data
topics_vis_data = pyLDAvis.prepare(**topics_model_data)
pyLDAvis.display(topics_vis_data)


# 2 - Tendance des topics
import matplotlib.pyplot as plt
from collections import Counter
%matplotlib inline 

def get_topic_apperences_year_month(data, info):
    """Group a slice of each document's info string by the document's strongest topic.

    For every row of ``data`` the position of its maximum value (first one
    on ties) is the key; characters 7 to 13 of ``info[i][0]`` are appended
    to that key's list.
    """
    appearances = {}
    for position, weights in enumerate(data):
        strongest = weights.index(max(weights))
        stamp = info[position][0][7:14]
        if strongest not in appearances:
            appearances[strongest] = []
        appearances[strongest].append(stamp)
    return appearances

def get_topic_apperences_year(data, info):
コード例 #25
0
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim

warnings.simplefilter('ignore')

# Convert reviews into bag of words
total_review_text = pd.DataFrame(list(business_reviews.items()), 
                                 columns = ['business_id', 'review']).review.apply(tokenize_text)
# Create dictionary of words
dictionary = corpora.Dictionary(total_review_text)
# Compute the term frequency of terms in each document
corpus = [dictionary.doc2bow(review) for review in total_review_text]
# Compute LDA model (num_topics = 4, since we want to compare the topics to the previous 4 wordclouds)
# `models` is the name imported at the top of this cell; the bare `gensim`
# package name is not imported there.
lda_model = models.ldamodel.LdaModel(corpus, num_topics = 4, id2word = dictionary, passes = 10)
print('The words and scores defining each topic are:')
lda_model.print_topics(num_topics = 4, num_words = 8)


# In[23]:


# Prepare the fitted model for pyLDAvis and show it inline.
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)


# After using the LDA algorithm to find 4 large topics, it can be observed that the topics do indeed have a number of similar words shown in wordclouds (which is created through Louvain for partitioning and TF-IDF for scoring). 
# 
# For example, topic 4 presented here is clearly showing words related to food and dessert, such as: '_salad_', '_steak_' or '_buffet_', which is very similar to the words shown in the wordcloud for community 2. 
コード例 #26
0
                startrow=0)
print('LDA_result_pos 成功输出!\n')
# Topic analysis of the negative reviews
neg_dict = corpora.Dictionary(neg)
neg_corpus = list(map(neg_dict.doc2bow, neg))
neg_lda = models.LdaModel(
    neg_corpus,
    num_topics=10,
    id2word=neg_dict,
    passes=10,
)
# Print every topic, numbered from 1.
for i in range(10):
    print('neg_topic' + ' ' + str(i + 1) + ' : ')
    print(neg_lda.print_topic(i))
# Export the ten topics (ten words each) to a spreadsheet.
LDA_result_neg = neg_lda.print_topics(num_topics=10, num_words=10)
df_neg = pd.DataFrame(data=LDA_result_neg)
df_neg.to_excel('LDA_result_neg.xlsx')
print('LDA_result_neg 成功输出!\n')

# ================= Topic cluster visualisation ==================
# Positive-review model: print the prepared data, save it as HTML, then
# display and serve it.
data2 = pyLDAvis.gensim.prepare(pos_lda, pos_corpus, pos_dict)
print('以下是正面可视化参数\n', data2, sep='\n')
pyLDAvis.save_html(data2, 'postopic.html')
pyLDAvis.display(data2)
pyLDAvis.show(data2, open_browser=True)
# Same steps for the negative-review model (disabled):
# data1 = pyLDAvis.gensim.prepare(neg_lda, neg_corpus, neg_dict)
# print('以下是负面可视化参数\n')
# print(data1)
# pyLDAvis.save_html(data1, 'negtopic.html')
# pyLDAvis.display(data1)
# pyLDAvis.show(data1, open_browser=True)
コード例 #27
0
from gensim import corpora, models
import pyLDAvis.gensim
import pyLDAvis

# Load the previously saved dictionary, corpus and LDA model from data/model/.
# NOTE(review): the "_50" in the model file name presumably is the number of
# topics - confirm.
dic = corpora.Dictionary.load('data/model/newsgroups.dict')
corp = corpora.MmCorpus('data/model/newsgroups.mm')
lda = models.ldamodel.LdaModel.load('data/model/newsgroups_50.model')

# Prepare the data for the visualization
newsgroup_data = pyLDAvis.gensim.prepare(lda, corp, dic)

# Create the visualization
# NOTE(review): the return value is not used; this presumably only renders
# when it is the result of a notebook cell - confirm the intended environment.
pyLDAvis.display(newsgroup_data)

# Save the visualization as a html file 
pyLDAvis.save_html(newsgroup_data, 'data/model/newsgroup_ldavis.html')
コード例 #28
0
 def display(self) -> None:
     """Obtain the prepared data from ``get_vis`` and hand it to ``pyLDAvis.display``."""
     prepared = self.get_vis()
     pyLDAvis.display(prepared)
コード例 #29
0
 def visualization(self):
     """Return the ``pyLDAvis.display`` object for this instance's model.

     The data is prepared with ``pyLDAvis.sklearn.prepare`` from
     ``self.lda``, ``self.X`` and ``self.vectorizer``.
     """
     prepared = pyLDAvis.sklearn.prepare(self.lda, self.X, self.vectorizer)
     return pyLDAvis.display(prepared)
コード例 #30
0
ファイル: Topics.py プロジェクト: luminescent/ratesetter
# Tokenise every review with ReviewNormalizer, then pretty-print two randomly
# picked reviews with their tokens re-joined by spaces.
# NOTE(review): if `randint` is `random.randint`, its upper bound is inclusive
# and `len(normalized_reviews)` can index one past the end (IndexError) -
# confirm which `randint` is imported.
rn = ReviewNormalizer()
normalized_reviews = [rn.tokenize(r)
                      for r in reviews]
pretty_print_html([" ".join(normalized_reviews[randint(0, len(normalized_reviews))]), 
                   " ".join(normalized_reviews[randint(0, len(normalized_reviews))])])


# #### Training the model (this might take a while...)

# In[12]:

# Build the vocabulary and bag-of-words corpus from the tokenised reviews,
# then fit a 5-topic LDA model with 100 passes over the corpus.
dictionary = corpora.Dictionary(normalized_reviews)
corpus = [dictionary.doc2bow(r)
          for r in normalized_reviews]
lda = LdaModel(corpus=corpus, num_topics=5, id2word=dictionary, passes=100)


# #### Prepare data and visualize!

# In[14]:

# NOTE(review): `prepare` is presumably pyLDAvis.gensim.prepare - confirm
# against the import block.
prepared_data = prepare(lda, corpus, dictionary)
pyLDAvis.display(prepared_data)


# In[ ]:



コード例 #31
0
ファイル: user_based_lda.py プロジェクト: askren/panderersbox
def get_vis(model, corpus, dictionary):
    """Prepare, save and return the pyLDAvis visualization of a topic model.

    Parameters
    ----------
    model
        Topic model handed to ``pyLDAvis.gensim.prepare``.
    corpus
        Corpus handed to ``pyLDAvis.gensim.prepare``.
    dictionary
        Dictionary handed to ``pyLDAvis.gensim.prepare``.

    Returns
    -------
    The object produced by ``pyLDAvis.display``. It is returned (instead of
    being discarded) so that a notebook can render it when the call is the
    last expression of a cell; callers that ignore the result are unaffected.
    """
    vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
    # The target path is built by plain concatenation, so
    # configuration.lda_dir has to end with a path separator for the file to
    # land inside that directory.
    pyLDAvis.save_html(vis, configuration.lda_dir + 'lda_visualization_test.html')
    return pyLDAvis.display(vis)
# Dump the pre-processed documents, one per line. The context manager closes
# the file, which guarantees the buffered lines are flushed to disk; the
# handle keeps its original module-level name.
with open('tokens_after_lemmas_and_rm_stopwords.txt', 'w') as tokens_after_lemmas_and_rm_stopwords:
    for item in texts:
        tokens_after_lemmas_and_rm_stopwords.write("%s\n" % item)

# Persist the dictionary, the corpus and the model; the second command-line
# argument is used as the suffix of every output file name.
dictionary.save_as_text('lemmas_nostopwords_with_otherdatacleaning_dictionary_' + sys.argv[2] + '.txt')

corpora.MmCorpus.serialize('lemmas_nostopwords_corpus_'+ sys.argv[2] +'.mm', corpus)

joblib.dump(lda, 'ldamodel_'+ sys.argv[2]+ '.pkl')


# In[6]:

# Inspect the corpus entry at index 56.
print(corpus[56])


# In[4]:

# Reload the dictionary, corpus and model from the files carrying the suffix
# 1000000.
# NOTE(review): joblib.load unpickles the file - only load model files that
# come from a trusted source.
dictionary = gensim.corpora.Dictionary.load_from_text('lemmas_nostopwords_with_otherdatacleaning_dictionary_1000000.txt')
corpus = gensim.corpora.MmCorpus('lemmas_nostopwords_corpus_1000000.mm')
lda = joblib.load('ldamodel_1000000.pkl')

# Request 20 topics with 8 words each; the bare expression is what a notebook
# cell shows as its output.
(lda.print_topics(num_topics=20, num_words=8))


# In[4]:

# Prepare the pyLDAvis data for the reloaded model and display it.
lda_vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(lda_vis)

コード例 #33
0
                                                             1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

# Document lengths: sum every row of the document-term matrix and flatten the
# result into a plain Python list.
# NOTE(review): the trailing [0] presumably relies on count_data being a
# matrix type whose flatten() result stays two-dimensional - confirm.
doc_lengths = count_data.sum(axis=1).flatten().tolist()[0]
len(doc_lengths)
# Overall term frequencies: the same row sum, taken on the transposed matrix.
dtm_trans = count_data.T
total = dtm_trans.sum(axis=1).flatten().tolist()[0]
len(total)
len(vocab)

# Keyword arguments for pyLDAvis.prepare, collected from the model and the
# counts computed above.
data = dict(
    topic_term_dists=model.topic_word_,
    doc_topic_dists=model.doc_topic_,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=list(total),
)
tef_vis_data = pyLDAvis.prepare(**data)

# Display the prepared data, then store it as a standalone HTML page.
pyLDAvis.display(tef_vis_data)

pyLDAvis.save_html(tef_vis_data, './guidedldavis_prepared_250k' + '.html')
コード例 #34
0
ファイル: pyldavis.py プロジェクト: jb-diplom/UvA-Papers
import json
import numpy as np
import pyLDAvis
# TODO for readme
# conda install -c conda-forge pyldavis
from bokeh.io import show, output_notebook, output_file


def load_R_model(filename):
    """Read a JSON model dump and rename its keys for ``pyLDAvis.prepare``.

    Parameters
    ----------
    filename : str or path-like
        Path of a JSON file containing the keys ``phi``, ``theta``,
        ``doc.length``, ``vocab`` and ``term.frequency``.

    Returns
    -------
    dict
        The five values under the keys ``topic_term_dists``,
        ``doc_topic_dists``, ``doc_lengths``, ``vocab`` and
        ``term_frequency``. Any other key in the file is ignored.

    Raises
    ------
    KeyError
        If one of the five expected keys is missing from the file.
    """
    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the platform's locale encoding, which would corrupt
    # non-ASCII vocabulary on systems whose default is not UTF-8.
    with open(filename, 'r', encoding='utf-8') as j:
        data_input = json.load(j)
    return {
        'topic_term_dists': data_input['phi'],
        'doc_topic_dists': data_input['theta'],
        'doc_lengths': data_input['doc.length'],
        'vocab': data_input['vocab'],
        'term_frequency': data_input['term.frequency'],
    }


# Configures bokeh's output target. Kept for any bokeh plots shown elsewhere;
# it has no effect on the pyLDAvis page written below.
f = output_file("pyDAVis.html")
# output_notebook() # TODO for use in notebook
# pyLDAvis.enable_notebook()
movies_model_data = load_R_model('data/movie_reviews_input.json')

movies_vis_data = pyLDAvis.prepare(**movies_model_data)
p = pyLDAvis.display(movies_vis_data)  # should use this in notebook
# p=pyLDAvis.show(movies_vis_data) # displays in own window combined with output_file
# pyLDAvis.display() returns an IPython HTML object, not a bokeh model, so
# bokeh's show() rejects it. Write the standalone page with pyLDAvis itself.
pyLDAvis.save_html(movies_vis_data, "pyDAVis.html")
コード例 #35
0
    # list(map(load_doc, [glob('notebooks/pyLDAvis/data/20news-bydate-train/*/*')[0]]))
    # docs = pd.DataFrame(list(map(load_doc, glob('notebooks/pyLDAvis/data/20news-bydate-train/*/*')))).set_index(['group', 'id'])

    # Leftover notebook inspection; the result is not used.
    docs.head()

    # %%

    # %%
    # Drop rows whose "tokens" value renders as '[]', then build the
    # dictionary and corpus and persist both.
    # NOTE(review): prep_corpus is defined outside this excerpt; it is assumed
    # to return a (dictionary, corpus) pair as the unpacking suggests.
    docs = docs[docs.astype(str)["tokens"] != '[]']  # remove empty letters
    dictionary, corpus = prep_corpus(docs['tokens'])
    MmCorpus.serialize('courrier.mm', corpus)
    dictionary.save('courrier.dict')

    # %%
    # Fit a 5-topic LDA model with 10 passes and save it under a file name
    # that contains the topic count.
    num_topics = 5
    lda = models.ldamodel.LdaModel(corpus=corpus,
                                   id2word=dictionary,
                                   num_topics=num_topics,
                                   passes=10)

    lda.save(f'courrier_{num_topics}_lda.model')

    import pyLDAvis.gensim as gensimvis
    import pyLDAvis

    #%%

    # LdaModel.load is a classmethod that returns a new model; it does not
    # update the instance it is called on. Bind the result so the model read
    # back from disk is the one that gets visualized.
    lda = models.ldamodel.LdaModel.load(f'courrier_{num_topics}_lda.model')
    vis_data = gensimvis.prepare(lda, corpus, dictionary)
    pyLDAvis.display(vis_data)
コード例 #36
0
def tokenize_and_stem(text):
    """Return the stems of every word token in *text* that contains a letter.

    The text is split into sentences first and each sentence into words, so
    punctuation ends up as its own token. Tokens without any ASCII letter
    (e.g. numbers, raw punctuation) are dropped before stemming with
    ``p_stemmer``.
    """
    words = (
        word
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [p_stemmer.stem(word) for word in words if re.search('[a-zA-Z]', word)]

from gensim import corpora, models, similarities 
# Tokenize and stem every cleaned email.
token_emails = [tokenize_and_stem(text) for text in clean_emails]

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(token_emails)

# Remove extremes: drop tokens that occur in more than 80% of the documents
# (no_below=1 removes nothing at the low end).
dictionary.filter_extremes(no_below=1, no_above=0.8)

# Reassign word ids so that there are no gaps left by the filtering.
dictionary.compactify()

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in token_emails]
# Load the persisted model instead of training a new one.
# NOTE(review): the rebuilt dictionary presumably has to match the id mapping
# the saved model was trained with - confirm.
final=models.ldamodel.LdaModel.load('output/final_topic10.model')
import pyLDAvis.gensim as gensimvis
import pyLDAvis
# Prepare the pyLDAvis data for the loaded model and display it.
vis_data = gensimvis.prepare(final, corpus, dictionary)
pyLDAvis.display(vis_data)