Ejemplo n.º 1
0
def files_10():
    """Fit a 6-topic LDA model on one question column and show it in pyLDAvis.

    Reads ``file<count>.csv``, passes every value of the ``"QUESTION<count>"``
    column to ``preprocess``, builds a gensim dictionary and bag-of-words
    corpus from the module-level ``result``, writes the topics to the text
    file ``Question <count>`` and finally opens the pyLDAvis visualisation.

    NOTE(review): the return value of ``preprocess`` is discarded, so it
    presumably fills the module-level ``result`` as a side effect -- confirm.
    """
    # Change number to select file
    count = 7
    df = pd.read_csv('file' + str(count) + '.csv')
    column = '"QUESTION' + str(count) + '"'
    for _, row in df.iterrows():
        preprocess(row[column])
    print(result)

    text = 'Question ' + str(count)
    # Truncate the report before training so a stale report never survives.
    with open(text, "w") as result_file:
        result_file.write('')

    dictionary = gensim.corpora.Dictionary(result)
    bow_corpus = [dictionary.doc2bow(doc) for doc in result]

    # Dump the bag-of-words of the first document as a sanity check.
    for token_id, token_count in bow_corpus[0]:
        print("Word {} (\"{}\") appears {} time.".format(
            token_id, dictionary[token_id], token_count))

    lda_model = gensim.models.LdaMulticore(bow_corpus,
                                           num_topics=6,
                                           id2word=dictionary,
                                           passes=10,
                                           workers=2,
                                           per_word_topics=True)

    # Open the report once instead of re-opening it for every topic.
    with open(text, "a") as result_file:
        for idx, topic in lda_model.print_topics(-1):
            result_file.write("Topic: {} \nWords: {}".format(idx, topic) +
                              "\n")

    vis = pyLDAvis.gensim.prepare(topic_model=lda_model,
                                  corpus=bow_corpus,
                                  dictionary=dictionary)
    pyLDAvis.enable_notebook()
    pyLDAvis.show(vis)
Ejemplo n.º 2
0
def lda(doctors, topic):
    """Train a 3-topic gensim LDA model on tweet texts and show it in pyLDAvis.

    The sequences returned by ``lda_user(doctors)`` and ``lda_topic(topic)``
    are concatenated and split into word tokens. A dictionary is built from
    the tokens (``filter_extremes(no_below=2, no_above=0.8)`` prunes it) and
    every text is converted to a bag-of-words vector before training.

    Args:
        doctors: forwarded unchanged to ``lda_user``.
        topic: forwarded unchanged to ``lda_topic``.
    """
    refined_tweets = lda_user(doctors)
    refined_tweets += lda_topic(topic)

    tokenizer = RegexpTokenizer(r'\w+')
    texts = [tokenizer.tokenize(tweet) for tweet in refined_tweets]

    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=2, no_above=0.8)
    corpus = [dictionary.doc2bow(text) for text in texts]

    m = models.LdaModel(corpus,
                        id2word=dictionary,
                        num_topics=3,
                        update_every=5,
                        chunksize=10000,
                        passes=10)

    # The former ``topics_matrix`` / ``keywordArray`` locals were never read
    # and have been dropped; only the visualisation is produced.
    p = pyLDAvis.gensim.prepare(m, corpus, dictionary)
    pyLDAvis.show(p)
Ejemplo n.º 3
0
def generate_ldavis_data_v1(data_path, run_name, model, idx_to_word, freqs,
                            vocab_size):
    """Show the model's learned embeddings in a locally served pyLDAvis page.

    The document, topic and word embeddings are evaluated through
    ``model.sess``, combined with the vocabulary and the stored document
    lengths by ``prepare_topics`` and handed to pyLDAvis.

    Args:
        data_path: base directory; joined with ``/`` to ``run_name``.
        run_name: sub-directory that holds ``doc_lengths.npy``.
        model: object exposing ``sess`` and the three ``*_embedding`` tensors.
        idx_to_word: mapping from token index to token string.
        freqs: passed to ``prepare_topics`` as ``term_frequency``.
        vocab_size: number of vocabulary entries to read from ``idx_to_word``.
    """
    session = model.sess
    doc_embed = session.run(model.doc_embedding)
    topic_embed = session.run(model.topic_embedding)
    word_embed = session.run(model.word_embedding)

    # Indices run 1..vocab_size: the Keras Tokenizer reserves 0 for PAD.
    vocabulary = np.array(
        [idx_to_word[index] for index in range(1, vocab_size + 1)])

    doc_lengths = np.load(data_path / run_name / 'doc_lengths.npy')

    # `prepare_topics` is a direct copy from Chris Moody
    topic_kwargs = dict(doc_lengths=doc_lengths,
                        term_frequency=freqs,
                        normalize=True)
    vis_data = prepare_topics(doc_embed, topic_embed, word_embed, vocabulary,
                              **topic_kwargs)
    pyLDAvis.show(pyLDAvis.prepare(**vis_data))
Ejemplo n.º 4
0
def lda(doctors, topic):
    """Fit a 3-topic gensim LDA model on tweet texts and open it in pyLDAvis.

    Texts come from ``lda_user(doctors)`` followed by ``lda_topic(topic)``.
    Each text is split into word tokens, the dictionary is pruned with
    ``filter_extremes(no_below=2, no_above=0.8)`` and the texts are converted
    to bag-of-words vectors before the model is trained.

    Args:
        doctors: forwarded unchanged to ``lda_user``.
        topic: forwarded unchanged to ``lda_topic``.
    """
    refined_tweets = lda_user(doctors)
    refined_tweets += lda_topic(topic)

    tokenizer = RegexpTokenizer(r'\w+')
    texts = [tokenizer.tokenize(tweet) for tweet in refined_tweets]

    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=2, no_above=0.8)
    corpus = [dictionary.doc2bow(text) for text in texts]

    m = models.LdaModel(corpus,
                        id2word=dictionary,
                        num_topics=3,
                        update_every=5,
                        chunksize=10000,
                        passes=10)

    # Unused locals (keywordArray, topics_matrix) and the commented-out
    # experiments were removed; nothing read them.
    p = pyLDAvis.gensim.prepare(m, corpus, dictionary)
    pyLDAvis.show(p)
Ejemplo n.º 5
0
def generate_ldavis_data(data_path, run_name, model, idx_to_word, freqs,
                         vocab_size):
    """Serve a pyLDAvis page that visualises the results of the model.

    Embeddings are evaluated through ``model.sesh``; the vocabulary is read
    from ``idx_to_word`` for indices ``0 .. vocab_size - 1`` and the document
    lengths are loaded from ``<data_path>/<run_name>/doc_lengths.npy``.
    """
    run = model.sesh.run
    doc_embed = run(model.doc_embedding)
    topic_embed = run(model.topic_embedding)
    word_embed = run(model.word_embedding)

    # All unique words ordered by index 0..vocab_size-1
    vocabulary = [idx_to_word[index] for index in range(vocab_size)]

    # Document lengths saved alongside the run
    lengths_file = "/".join([data_path, run_name, "doc_lengths.npy"])
    doc_lengths = np.load(lengths_file)

    # prepare_topics is a direct copy from Chris Moody
    vis_data = prepare_topics(
        doc_embed,
        topic_embed,
        word_embed,
        np.array(vocabulary),
        doc_lengths=doc_lengths,
        term_frequency=freqs,
        normalize=True,
    )

    pyLDAvis.show(pyLDAvis.prepare(**vis_data))
Ejemplo n.º 6
0
def get_lda(data, n_components=5, n_features=6000):
    """Fit a scikit-learn LDA topic model on ``data.fcjg`` and show it.

    Args:
        data: object exposing the raw documents as ``data.fcjg``.
        n_components: number of topics. The previous implementation
            overwrote this argument with a hard-coded 5; it is now honoured
            (the default is still 5).
        n_features: maximum vocabulary size used by the vectorizer.

    Returns:
        Tuple ``(tf_feature_names, data_list)`` where ``data_list`` is the
        value returned by ``print_top_words`` for the top 300 words per topic.
    """
    # Vectorize the text into term counts.
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    max_features=n_features,
                                    stop_words='english',
                                    max_df=0.5,
                                    min_df=10)
    tf = tf_vectorizer.fit_transform(data.fcjg)

    # n_components controls the number of topics.
    lda = LatentDirichletAllocation(n_components=n_components,
                                    max_iter=50,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)

    n_top_words = 300  # how many words to report per topic
    # NOTE(review): newer scikit-learn releases replace get_feature_names()
    # with get_feature_names_out() -- confirm against the pinned version.
    tf_feature_names = tf_vectorizer.get_feature_names()
    data_list = print_top_words(lda, tf_feature_names, n_top_words)
    data_plot = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
    pyLDAvis.show(data_plot)
    return tf_feature_names, data_list
Ejemplo n.º 7
0
    def LDAvis(self, model,
               save_plot=True,
               save_dir='results',
               filename='',
               ext='.html',
               show_plot=True,
               is_notebook=True,
               mds='mds',
               sort_topics=False,
               **kwargs):
        '''
        Use pyLDAvis to visualize clustering.

        :param model: topic model handed to ``gensimvis.prepare`` together
            with ``self.bow`` and ``self.gensim_dict``.
        :param save_plot: when true, write the visualization to disk.
        :param save_dir: directory forwarded to ``save_folder_file``.
        :param filename: output file name; ``'LDAvis_plot_'`` is used when
            empty.
        :param ext: output extension; only ``'.html'`` is supported.
        :param show_plot: when true, display the visualization.
        :param is_notebook: when true (and ``show_plot``), return the prepared
            data so the notebook renders it; otherwise call ``pyLDAvis.show``.
        :param mds: forwarded to ``gensimvis.prepare``.
        :param sort_topics: forwarded to ``gensimvis.prepare``.
        :param kwargs: extra keyword arguments for ``gensimvis.prepare``.
        :return: the prepared data when ``show_plot and is_notebook``,
            otherwise ``None``.
        '''

        print('Rendering visualization...')

        vis = gensimvis.prepare(model, self.bow, self.gensim_dict, mds=mds,
                                sort_topics=sort_topics, **kwargs)

        if save_plot:
            if ext == '.html':
                if not filename:
                    filename = 'LDAvis_plot_'
                # The path used to be computed only for an empty filename,
                # so a caller-supplied name raised NameError on save.
                full_path = save_folder_file(save_dir, filename, ext=ext,
                                             optional_folder='LDAvis_plots')
                pyLDAvis.save_html(vis, full_path)
            else:
                print('File extension not supported')

        if show_plot:
            if is_notebook:
                return vis  # the notebook renders the returned object
            else:
                pyLDAvis.show(vis)
Ejemplo n.º 8
0
def plot_pyldavis(topic_model, document_topic_matrix, document_term_matrix, file=None, **kwargs):
    """
    Generate a pyLDAvis visualization of the given topic model. For more information about the visualization read the `original paper <http://www.aclweb.org/anthology/W14-3110>`_ by Sievert and Shirley. Note that pyLDAvis only supports LDA models,
    passing a nmf model will cause an exception.

    :param topic_model: Topic model exposing ``model_name``, ``get_topic_token_matrix()`` and ``id2token``.
    :param document_topic_matrix: A document-topic matrix as returned by calling get_document_topic_matrix() on a topic model.
    :type document_topic_matrix: np.ndarray
    :param document_term_matrix: Term count weighted document-term matrix of the documents used to infer the document_topic_matrix.
    :type document_term_matrix: np.ndarray
    :param file: File name of the HTML output, stored under ``reports/figures/pyLDAvis`` next to this package. If no file is passed the plot is visualized in the browser.
    :type file: str
    :param kwargs: Further parameters passed directly to pyLDAvis's prepare function. See the `documentation <http://pyldavis.readthedocs.io/en/latest/modules/API.html#pyLDAvis.prepare>`_ for options. Note, that sort_topics=False is already set.
    :raises ValueError: If ``topic_model.model_name`` is not ``'lda'``.
    """
    if topic_model.model_name != 'lda':
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError('pyLDAvis only supports LDA. {} not supported'.format(topic_model.model_name))
    topic_token_matrix = topic_model.get_topic_token_matrix(normalize=True)
    id2word = topic_model.id2token

    # Row sums are the document lengths, column sums the term frequencies.
    document_lengths = np.sum(document_term_matrix, axis=1).getA1()
    term_frequencies = np.sum(document_term_matrix, axis=0).getA1()
    prepared_data = pyLDAvis.prepare(topic_token_matrix, document_topic_matrix, document_lengths, id2word,
                                     term_frequencies, sort_topics=False, **kwargs)

    ROOT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
    REPORT_DIR = os.path.join(ROOT_DIR, 'reports')
    if file:
        base_path = os.path.join(REPORT_DIR, 'figures/pyLDAvis')
        # Create the report folder on first use instead of failing in open().
        os.makedirs(base_path, exist_ok=True)
        pa = os.path.join(base_path, file)
        with open(pa, 'w') as f:
            pyLDAvis.save_html(prepared_data, f)
    else:
        pyLDAvis.show(prepared_data)
Ejemplo n.º 9
0
def LDA_model(vectorized_data, num_topics=20):
    """Train a gensim LDA model on whitespace-tokenised texts and show it.

    Args:
        vectorized_data: iterable of strings; each string is split on
            whitespace to obtain its tokens.
        num_topics: number of topics to fit.

    The top 20 words of every topic returned by ``show_topics`` are printed,
    then the model is opened in pyLDAvis.
    """
    # Create the dictionary
    texts = [text.split() for text in vectorized_data]
    id2word = corpora.Dictionary(texts)
    # Remove extremes (similar to the min/max df step used when creating the
    # tf-idf matrix)
    id2word.filter_extremes(no_below=1, no_above=0.8)
    # Term document frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    lda_model = models.LdaModel(corpus,
                                num_topics=num_topics,
                                id2word=id2word,
                                update_every=1,
                                chunksize=100,
                                passes=10,
                                random_state=0,
                                per_word_topics=True)

    # show_topics yields (topic_id, [(word, weight), ...]) pairs. Label each
    # topic with its real id rather than its position in the returned list,
    # which can differ when only a subset of the topics is returned.
    # NOTE(review): show_topics is called with its default topic count, so
    # presumably not every topic is listed when num_topics is large -- confirm.
    for topic_id, word_weights in lda_model.show_topics(formatted=False,
                                                        num_words=20):
        words = [word for word, _ in word_weights]
        print('Topic: {} \nWords: {}'.format(topic_id, '|'.join(words)))
        print('----next topic-----')

    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    pyLDAvis.show(vis)
Ejemplo n.º 10
0
def topic_visual(best_lda_model, data_vectorized, vectorizer):
    """Prepare a t-SNE based pyLDAvis panel for a fitted model and show it.

    Notebook rendering is enabled first; the panel built from the model, the
    vectorized data and the vectorizer is then passed to ``pyLDAvis.show``.
    """
    pyLDAvis.enable_notebook()
    prepare_args = (best_lda_model, data_vectorized, vectorizer)
    pyLDAvis.show(pyLDAvis.sklearn.prepare(*prepare_args, mds='tsne'))
Ejemplo n.º 11
0
    def vis(self):
        """Show this object's model, corpus and dictionary in pyLDAvis."""
        prepared = pyLDAvis.gensim.prepare(self.model, self.corpus,
                                           self.id2word)
        pyLDAvis.show(prepared)
Ejemplo n.º 12
0
def generate_ldavis_data(data_path, model, idx_to_word, freqs, vocab_size):
    """
    Launch a locally hosted pyLDAvis session to visualize the model results.

    :param data_path: (PosixPath) data location containing ``doc_lengths.npy``
    :param model: TensorFlow model exposing ``sess`` and the embeddings
    :param idx_to_word: (dict) index-to-word mapping
    :param freqs: (list) frequency counts of each token
    :param vocab_size: (int) size of vocabulary
    :return: None
    """
    session = model.sess
    doc_embed = session.run(model.doc_embedding)
    topic_embed = session.run(model.topic_embedding)
    word_embed = session.run(model.word_embedding)

    # Slot 0 is the PAD token (the Keras Tokenizer indexes real words from 1);
    # the remaining slots are filled from indices 1 .. vocab_size - 1.
    vocabulary = ['<PAD>'] + [idx_to_word[i] for i in range(1, vocab_size)]

    doc_lengths = np.load(data_path / 'doc_lengths.npy')

    # `prepare_topics` is a direct copy from Chris Moody
    vis_data = prepare_topics(
        doc_embed,
        topic_embed,
        word_embed,
        np.array(vocabulary),
        doc_lengths=doc_lengths,
        term_frequency=freqs,
        normalize=True,
    )
    pyLDAvis.show(pyLDAvis.prepare(**vis_data))
Ejemplo n.º 13
0
    def visualize_topics(self, notebook_mode: bool = False, mds: str = 'pcoa'):
        """
        Visualize the fitted LDA topics with pyLDAvis.

        Parameters
        ----------
        notebook_mode : bool, optional (default=False)
            If True, enable pyLDAvis notebook rendering and return the
            prepared data instead of starting the viewer.
        mds : str, optional (default='pcoa')
            2D Decomposition. Allowed values:

            * ``'pcoa'`` - Dimension reduction via Jensen-Shannon Divergence & Principal Coordinate Analysis (aka Classical Multidimensional Scaling)
            * ``'mmds'`` - Dimension reduction via Multidimensional scaling
            * ``'tsne'`` - Dimension reduction via t-distributed stochastic neighbor embedding

        Returns
        -------
        The prepared pyLDAvis data when ``notebook_mode`` is True, else None.

        Raises
        ------
        ValueError
            If ``self.comp`` is not a ``LatentDirichletAllocation`` instance.
        """

        if not isinstance(self.comp, LatentDirichletAllocation):
            raise ValueError('only support lda_topic_modelling()')

        import pyLDAvis
        import pyLDAvis.sklearn

        if notebook_mode:
            pyLDAvis.enable_notebook()

        panel = pyLDAvis.sklearn.prepare(self.comp, self._vectors,
                                         self.vectorizer, mds=mds)
        if not notebook_mode:
            pyLDAvis.show(panel)
            return None
        return panel
Ejemplo n.º 14
0
    def compute_lda(self, file_name, n_topics, alpha, beta):
        """Load or train the LDA model for ``file_name`` and show it.

        A model pickled at ``data/topic_modeling/<file_name>-lda_model.pckl``
        is reused when present; otherwise a new ``LdaMulticore`` model is
        trained on ``self.corpus`` / ``self.id2word`` and pickled there.

        Args:
            file_name: prefix of the cached model file.
            n_topics: number of topics for a newly trained model.
            alpha: passed to gensim as ``alpha``.
            beta: passed to gensim as ``eta``.
        """
        print("STATUS: START BUILDING MODEL")
        path_base = "data/topic_modeling/"
        path_lda_model = path_base + file_name + "-lda_model.pckl"
        if os.path.exists(path_lda_model):
            # NOTE(review): pickle.load executes arbitrary code from the
            # file; only safe while the cache directory is trusted.
            # `with` closes the handle even when unpickling fails.
            with open(path_lda_model, "rb") as f:
                lda_model = pickle.load(f)
            print(
                "STATUS: FINISHED BUILDING MODEL (USING ALREADY BUILT MODEL)")
        else:
            lda_model = gensim.models.LdaMulticore(
                corpus=self.corpus,
                id2word=self.id2word,
                num_topics=n_topics,
                random_state=100,
                chunksize=100,
                passes=10,
                alpha=alpha,
                eta=beta,
            )
            with open(path_lda_model, "wb") as f:
                pickle.dump(lda_model, f)
            print("STATUS: FINISHED BUILDING MODEL (NEW MODEL CREATED)")

        # Visualize the topics
        print("STATUS: START VISUALIZING MODEL")
        data = pyLDAvis.gensim.prepare(lda_model, self.corpus, self.id2word)
        pyLDAvis.show(data)
Ejemplo n.º 15
0
def plot_lda_vis(model_data, mode='show', filename=None):
    """Render pyLDAvis input data, either to an HTML file or in the viewer.

    Designed to work with to_py_lda_vis() in the model classes: ``model_data``
    is unpacked as keyword arguments for ``pyLDAvis.prepare``. The result is
    saved only when ``mode == 'save_html'`` and a ``filename`` is given;
    in every other case it is shown.
    """
    from pyLDAvis import prepare, save_html, show

    prepared = prepare(**model_data)
    wants_file = mode == 'save_html' and filename
    if not wants_file:
        show(prepared)
        return
    save_html(prepared, filename)
Ejemplo n.º 16
0
def create_models(texts):
    """Train a 20-topic LDA model on tokenised ``texts`` and show it.

    Prints the model's topics and the prepared pyLDAvis data before the
    visualization is opened.
    """
    dictionary = Dictionary(texts)
    corpus = list(map(dictionary.doc2bow, texts))
    lda = ldamodel.LdaModel(corpus=corpus,
                            id2word=dictionary,
                            num_topics=20,
                            passes=10)
    print(lda.show_topics())

    vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    print(vis_data)
    pyLDAvis.show(vis_data)
Ejemplo n.º 17
0
def visLDA(model, data, vectorizer, ip, port):
    """Serve the pyLDAvis view of a scikit-learn LDA model on ``ip:port``.

    The data is prepared with ``mds='mmds'`` and ``sort_topics=False``; see
    https://github.com/bmabey/pyLDAvis/issues/69 for the background.
    """
    prepare_options = {'mds': 'mmds', 'sort_topics': False}
    visData = pyLDAvis.sklearn.prepare(model, data, vectorizer,
                                       **prepare_options)
    pyLDAvis.show(visData, ip=ip, port=port)
Ejemplo n.º 18
0
    def visualize(self):
        """Start the local web server and display the fitted LDA model.

        ``self.check_model()`` runs first; the model, vectorized data and
        vectorizer are then prepared with t-SNE scaling and shown.
        """
        self.check_model()
        panel = prepare(self.model,
                        self.vectorized_data,
                        self.vectorizer,
                        mds='tsne')
        show(panel)
Ejemplo n.º 19
0
def topic_model(clean_txt: list, num_count: int):
    """Visualize a topic model built from a single tokenised text.

    ``clean_txt`` is treated as one document. The visualization is written
    to ``LDA.html`` and then opened in the browser.
    """
    documents = [clean_txt]
    common_dictionary = Dictionary(documents)
    common_corpus = [common_dictionary.doc2bow(doc) for doc in documents]
    lda = LdaModel(common_corpus, num_topics=num_count)

    vis = pyLDAvis.gensim.prepare(lda, common_corpus, common_dictionary)
    pyLDAvis.save_html(vis, 'LDA.html')
    pyLDAvis.show(data=vis, open_browser=True)
def plot_using_ldaviz(topics=10):
    """Fit LDA on the column labels of the module-level ``bow`` and show it.

    Every column label becomes a one-token document. The model is trained on
    the TF-IDF weighted corpus, while pyLDAvis receives the raw
    bag-of-words corpus.
    """
    texts = [[column] for column in bow.columns]
    dictionary = corpora.Dictionary(texts)
    corpus = list(map(dictionary.doc2bow, texts))

    weighted_corpus = models.TfidfModel(corpus)[corpus]
    lda = models.LdaModel(weighted_corpus,
                          num_topics=topics,
                          id2word=dictionary)

    pyLDAvis.show(gensimvis.prepare(lda, corpus, dictionary))
Ejemplo n.º 21
0
 def visualize(self, _label):
     """Show the pyLDAvis view for ``_label``, caching the prepared data.

     The prepared data is pickled to ``./models/<_label>.vis`` and reused on
     later calls; otherwise it is built from ``self.build(_label)``.
     """
     visfile = "./models/{0}.vis".format(_label)
     if os.path.isfile(visfile):
         # NOTE(review): pickle.load runs arbitrary code from the file; the
         # cache directory must be trusted.
         # `with` closes the handle, which the bare open() call never did.
         with open(visfile, "rb") as cache_file:
             vis_data = pickle.load(cache_file)
     else:
         dictionary, corpus, ldamodel = self.build(_label)
         vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
         with open(visfile, "wb") as cache_file:
             pickle.dump(vis_data, cache_file)
     pyLDAvis.show(vis_data)
Ejemplo n.º 22
0
def topic_model_visualize(textlist: list, num_topics: int) -> None:
    """Visualize a topic model built from a single tokenised text.

    ``textlist`` is wrapped so that it forms the only document of the
    corpus. The result is saved to ``LDA.html`` and opened in the browser.
    """
    documents = [textlist]
    common_dictionary = Dictionary(documents)
    common_corpus = list(map(common_dictionary.doc2bow, documents))
    lda = LdaModel(common_corpus, num_topics=num_topics)

    vis = pyLDAvis.gensim.prepare(lda, common_corpus, common_dictionary)
    pyLDAvis.save_html(vis, 'LDA.html')
    pyLDAvis.show(data=vis, open_browser=True)
Ejemplo n.º 23
0
def lda_vis(modeled_corpus, mode='show', filename=None):
    """Save or serve the pyLDAvis view of a modeled corpus.

    Designed to work with to_py_lda_vis() in the model classes. With
    ``mode == 'save_html'`` and a ``filename`` the view is written to that
    file; otherwise it is served on ``0.0.0.0:8888``.
    """
    from pyLDAvis import prepare, show, save_html

    prepared = prepare(**_to_py_lda_vis(modeled_corpus))
    save_requested = mode == 'save_html' and filename
    if not save_requested:
        show(prepared, ip="0.0.0.0", port=8888)
        return
    logging.info("Saving pyLDAVis to {}".format(filename))
    save_html(prepared, filename)
Ejemplo n.º 24
0
def lda_vis(modeled_corpus, mode='show', filename=None):
    """Write the pyLDAvis view of a modeled corpus to HTML, or serve it.

    Designed to work with to_py_lda_vis() in the model classes. The view is
    saved only for ``mode == 'save_html'`` with a ``filename``; otherwise
    it is served on ``0.0.0.0`` port ``8888``.
    """
    from pyLDAvis import prepare, show, save_html

    vis_kwargs = _to_py_lda_vis(modeled_corpus)
    prepared = prepare(**vis_kwargs)
    if mode != 'save_html' or not filename:
        show(prepared, ip="0.0.0.0", port=8888)
    else:
        logging.info("Saving pyLDAVis to {}".format(filename))
        save_html(prepared, filename)
Ejemplo n.º 25
0
    def visualize(self):
        """Start the local web server that displays the fitted LDA model.

        Raises:
            ValueError: if ``self.fitted`` is falsy.
        """
        if not self.fitted:
            raise ValueError('LDA model is not fitted')

        panel = prepare(self.lda,
                        self.vectorized_data,
                        self.vectorizer,
                        mds='tsne')
        show(panel)
Ejemplo n.º 26
0
def lda_vis(modeled_corpus, mode='show', filename=None):
    """Save or show the pyLDAvis view of a modeled corpus.

    Designed to work with to_py_lda_vis() in the model classes. The view is
    saved when ``mode == 'save_html'`` and ``filename`` is set, otherwise
    it is shown.
    """
    from pyLDAvis import prepare, show, save_html

    prepared = prepare(**_to_py_lda_vis(modeled_corpus))
    if mode != 'save_html' or not filename:
        show(prepared)
        return
    save_html(prepared, filename)
Ejemplo n.º 27
0
def LDA_analysis(texts, nTopics, onlyCount=True, showPic=True):
    """Fit LDA on n-gram features of ``texts`` and compare texts to topics.

    Args:
        texts: indexable collection of raw documents.
        nTopics: number of topics to fit.
        onlyCount: use ``CountVectorizer`` when true, ``TfidfVectorizer``
            otherwise; both get identical settings.
        showPic: open the pyLDAvis visualization when true.

    Returns:
        The cosine similarity between every vectorized document and every
        row of ``lda.components_``.
    """
    min_ngram = 2
    max_ngram = 4
    max_df = 1.0
    min_df = 0
    max_features = 500
    learning_offset = 20
    # Index of the document printed as a spot check.
    sample_idx = 9

    # NOTE(review): `n_topics` is the legacy name of this argument; newer
    # scikit-learn releases call it `n_components` -- confirm the pinned
    # version before renaming.
    lda = LatentDirichletAllocation(n_topics=nTopics,
                                    max_iter=50,
                                    learning_method='batch',
                                    learning_offset=learning_offset,
                                    random_state=0)

    # Both vectorizers take the same arguments, so only the class differs.
    vectorizer_cls = CountVectorizer if onlyCount else TfidfVectorizer
    vectorizer = vectorizer_cls(analyzer='word',
                                ngram_range=(min_ngram, max_ngram),
                                max_features=max_features,
                                encoding='utf-8',
                                strip_accents='unicode',
                                stop_words='english',
                                max_df=max_df,
                                min_df=min_df)
    X = vectorizer.fit_transform(texts)
    X_new = lda.fit_transform(X)
    feature_names = vectorizer.get_feature_names()

    print_top_words(lda, feature_names, 10)

    # print(...) with one argument behaves the same on Python 2 and 3; the
    # former print statements were syntax errors on Python 3.
    print(lda.components_.shape)
    # Skip the spot check on small corpora instead of raising IndexError.
    has_sample = X_new.shape[0] > sample_idx
    if has_sample:
        print(X_new[sample_idx])

    cosSim = cosine_similarity(X, lda.components_, dense_output=False)
    if has_sample:
        print(texts[sample_idx])
        print(cosSim[sample_idx])

    if showPic:
        data_pyLDAvis = pyLDAvis.sklearn.prepare(lda, X, vectorizer)
        pyLDAvis.show(data_pyLDAvis)

    return cosSim
Ejemplo n.º 28
0
def display_page(pathname):
    """Return the page content for the requested URL path.

    Args:
        pathname: requested path; may be ``None`` or empty.

    Returns:
        ``page_kmean_layout`` when the path contains ``"kmean"``; a link
        element when it contains ``"lda"`` (after the pyLDAvis view has been
        launched separately); ``None`` for any other path.
    """
    print(pathname)
    # A missing path used to raise TypeError on the `in` tests below.
    if not pathname:
        return None
    if "kmean" in pathname:
        return page_kmean_layout
    if "lda" in pathname:
        #data_prepared = pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)
        #, d3_url="js/d3.v3.min.js", ldavis_url="js/ldavis.js", ldavis_css_url="js/ldavis.css"
        #html = pyLDAvis.prepared_data_to_html(data_prepared)
        #print(html)
        pyLDAvis.show(
            pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer))
        return html.A("LDA visualisation on other tab for now")
    return None
Ejemplo n.º 29
0
def visualize():
    """Load the saved corpus, dictionary and LDA model and show them.

    Inputs are read from ``../data``: the pickled bag-of-words corpus, the
    gensim dictionary and the model ``model/LDA_model_v1``.
    """
    with open('../data/bow_corpus', 'rb') as input_file:
        corpus = pickle.load(input_file)

    dictionary_path = '../data/tweet_dictionary'
    tweet_dictionary = gensim.corpora.Dictionary.load(dictionary_path)

    # Load model
    model = LdaModel.load('../data/model/LDA_model_v1')

    # Visualization
    pyLDAvis.show(pyLDAvis.gensim.prepare(model, corpus, tweet_dictionary))
Ejemplo n.º 30
0
    def LDA_sklearn(self,
                    contents,
                    num_topics=5,
                    num_words=10,
                    max_df=0.95,
                    min_df=2,
                    max_features=1000,
                    showLDA=True):
        """Fit a scikit-learn LDA model on ``contents`` and describe topics.

        ``self.seg(contents)`` is called first; the documents are then the
        space-joined entries of ``self.content_seg``. Each topic is described
        as ``(idx, weight*"word" + ...)`` using its ``num_words`` heaviest
        words, stored in ``self.LDA_sklearn_topics`` and returned.
        """
        from sklearn.decomposition import LatentDirichletAllocation
        from sklearn.feature_extraction.text import CountVectorizer

        self.seg(contents)
        tf_vectorizer = CountVectorizer(max_df=max_df,
                                        min_df=min_df,
                                        max_features=max_features,
                                        stop_words='english')

        documents = [' '.join(content) for content in self.content_seg]
        tf = tf_vectorizer.fit_transform(documents)
        feature_names = tf_vectorizer.get_feature_names()

        estimator = LatentDirichletAllocation(n_components=num_topics,
                                              max_iter=5,
                                              learning_method='online',
                                              learning_offset=50.,
                                              random_state=0)
        model = estimator.fit(tf)

        self.LDA_sklearn_topics = []
        for topic_idx, topic in enumerate(model.components_):
            # Reverse slice: indices of the num_words largest weights,
            # heaviest first.
            top_indices = topic.argsort()[:-(num_words + 1):-1]
            terms = ''.join(
                str(np.around(topic[i], decimals=3)) + '*' + '\"%s\"' %
                (feature_names[i]) + ' + ' for i in top_indices)
            topic_info = '(%s, ' % (topic_idx) + terms
            # Drop the trailing ' + ' before closing the parenthesis.
            self.LDA_sklearn_topics.append(topic_info[:-3] + ')')

        # LDA visualization.
        # Reading the interactive chart: each circle is a topic and its size
        # reflects the number of documents in that topic.
        # --> With no circle selected, the 30 most important (most frequent)
        #     keywords are shown.
        # --> With a circle selected, each keyword's frequency within that
        #     topic is shown.
        if showLDA == True:
            import pyLDAvis
            import pyLDAvis.sklearn

            pyLDAvis.show(pyLDAvis.sklearn.prepare(model, tf, tf_vectorizer))

        return self.LDA_sklearn_topics
Ejemplo n.º 31
0
 def visualize_lda(self, df, display=False):
     """Save the pyLDAvis view of the LDA model and optionally show it.

     ``self.run_lda(df)`` is invoked first when ``self.lda_model`` equals
     ``[]``. The HTML file name under ``plots/`` encodes the vectorizer's
     ``max_features`` and the model's ``n_topics``.
     """
     if self.lda_model == []:
         self.run_lda(df)

     vectorizer_params = self.tf_vectorizer.get_params()
     max_features = vectorizer_params['max_features']
     model_params = self.lda_model.get_params()
     n_topics = model_params['n_topics']

     vis_data = pyLDAvis.sklearn.prepare(self.lda_model,
                                         self.tf,
                                         self.tf_vectorizer,
                                         R=n_topics,
                                         n_jobs=-1)

     html_path = ''.join([
         'plots/pyLDAvis_', str(max_features), 'feats_',
         str(n_topics), 'topics.html'
     ])
     pyLDAvis.save_html(vis_data, html_path)

     if not display:
         return
     pyLDAvis.show(vis_data)
Ejemplo n.º 32
0
def show_model_statistics(lda_model, with_visualization=False):
    """Print the c_v coherence score of ``lda_model``; optionally show it.

    Relies on the module-level ``data_lemmatized``, ``id2word`` and
    ``corpus``.
    """
    # Perplexity (lower is better) is currently not reported:
    # print('\nPerplexity: ', lda_model.log_perplexity(corpus))

    # Coherence score
    scorer = CoherenceModel(model=lda_model,
                            texts=data_lemmatized,
                            dictionary=id2word,
                            coherence='c_v')
    coherence_lda = scorer.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    if not with_visualization:
        return

    # Visualize the topics (notebook mode intentionally left disabled)
    pyLDAvis.show(pyLDAvis.gensim.prepare(lda_model, corpus, id2word))
Ejemplo n.º 33
0
def visualize_LDA(model, corpus):
    """
    Prepare a pyLDAvis visualization for an LDA model, save it and show it.

    The visualization uses t-SNE scaling and the model's own ``id2word``
    dictionary. It is saved to
    ``./LDAvis_prepared/random_users/LDAvis.html`` before being shown.
    input: an lda model and a corpus of words
    returns: None
    """
    vis_filename = './LDAvis_prepared/random_users/LDAvis.html'
    prepared = pyLDAvis.gensim.prepare(model,
                                       corpus,
                                       dictionary=model.id2word,
                                       mds='tsne')
    pyLDAvis.save_html(prepared, vis_filename)
    pyLDAvis.show(prepared)
    return None
Ejemplo n.º 34
0
	def vectorize(self):
		'''
		Generate an LDA topic model of the document using gensim and pyLDAvis.

		args:
		    none
		output:
		    writes the visualization to topic_models/<name>.json and
		    topic_models/<name>.html, then displays it
		'''
		# tokenize and remove stopwords; every sentence is one "document"
		sentences = self.sent_detector.tokenize(self.raw.decode('utf-8').strip()) # use raw text
		texts = [[word for word in sentence.lower().split() if word not in self.stopwords] for sentence in sentences]

		# compute the frequency of each token
		frequency = defaultdict(int)
		for text in texts:
			for token in text:
				frequency[token] += 1

		# remove words that appear only once
		texts = [[token for token in text if frequency[token] > 1] for text in texts]

		# construct a gensim dictionary and corpus (bag of words)
		dictionary = corpora.Dictionary(texts)
		corpus = [dictionary.doc2bow(text) for text in texts]

		# define LDA model
		# NOTE(review): num_topics=10 is an untuned placeholder.
		lda = models.ldamodel.LdaModel(corpus=corpus,
			id2word=dictionary,
			num_topics=10,
			update_every=1,
			chunksize=10000,
			passes=1)

		# visualize the lda space
		vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)

		# Write the exports with 'w' so a rerun replaces the files; the old
		# 'a+' mode appended a second document and corrupted the JSON/HTML.
		# They are written before show() so that they exist even if the
		# viewer is interrupted (show() presumably blocks while serving).
		with open('topic_models/'+self.name+'.json', 'w') as topic_json:
			pyLDAvis.save_json(vis_data, topic_json)
		with open('topic_models/'+self.name+'.html', 'w') as topic_html:
			pyLDAvis.save_html(vis_data, topic_html)

		pyLDAvis.display(vis_data)
		pyLDAvis.show(vis_data)