import re

import gensim
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim as gensimvis
from gensim import corpora
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from wordcloud import WordCloud


def issue_analysis(df):
    df_sub = df[['Issue']].copy()
    df_sub.insert(0, 'count', 1)

    # count occurrences of each issue and keep the 50 most frequent
    issue_counts = df_sub.groupby(['Issue']).sum().sort_values(
        by='count', ascending=False)
    Issue_List = list(issue_counts.index[:50])

    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')   # tokenizer regex
    en_stop = get_stop_words('en')                   # English stop word list
    p_stemmer = PorterStemmer()                      # Porter stemmer
    texts = []                                       # tokenized documents
    text_view = ''
                                                                
    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
       
        # remove stop words from tokens
        stopped_tokens = [tok for tok in tokens if tok not in en_stop]

        # stem tokens and add them to the list
        stemmed_tokens = [p_stemmer.stem(tok) for tok in stopped_tokens]
        texts.append(stemmed_tokens)

        #print ' '.join(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '

    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8,6))
    fig1 = fig.add_subplot(1,1,1)
    fig1.set_title("Top issued words", fontdict={'fontsize':25})
    fig1.imshow(wordcloud)
    fig1.axis("off")
    #plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')
    
    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word=dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)
       
    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "issue_lda.html")
    #pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")

    return 0
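A minimal usage sketch (the CSV name is hypothetical; any pandas DataFrame with an 'Issue' column of complaint categories works):

import pandas as pd

df = pd.read_csv('consumer_complaints_2016.csv')  # hypothetical input file
issue_analysis(df)  # writes ComplainCount_WC_2016.png plus issue_lda_2016.html/.json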
Example #2
    def save_lda_model(self, lda_model, corpus, dictionary):
        pyLDAvis.save_json(
            pyLDAvis.gensim.prepare(lda_model, corpus, dictionary),
            './../static/js/lda.json')
        print(lda_model.print_topics())
        lda_model.save('./../lda/model.lda')

        dictionary.save('./../lda/dict.lda')
        corpora.MmCorpus.serialize('./../lda/corpus.mm', corpus)
Example #3
    def save_lda_model(self, lda_model, corpus, dictionary, index):

        index.save(self.lda_path + 'index.lda')
        pyLDAvis.save_json(pyLDAvis.gensim.prepare(lda_model, corpus, dictionary), self.lda_path + '/../static/js/lda.json')
        print(lda_model.print_topics())
        lda_model.save(self.lda_path + 'model.lda')

        dictionary.save(self.lda_path + 'dict.lda')
        corpora.MmCorpus.serialize(self.lda_path + 'corpus.mm', corpus)
    def view_lda_model(self, model, corpus, dictionary):
        # corpus = [dictionary.doc2bow(doc) for doc in corpus]
        prepared_data = gensimvis.prepare(model,
                                          corpus,
                                          dictionary,
                                          mds='mmds')
        pyLDAvis.save_json(
            prepared_data,
            self.model_path + self.data_name + '_vis_result.json')
        pyLDAvis.save_html(
            prepared_data,
            self.model_path + self.data_name + '_vis_result.html')
Example #6
    def new(cls, name: str, dataset: Dataset, model: TopicModel,
            **kwargs) -> "Visualizer":
        path = common.PROJDIR / (name + ".LDAvis.json")
        pyLDAvis.save_json(
            pyLDAvis.prepare(model.get_topic_word_matrix(normalize=True),
                             model.get_doc_topic_matrix(normalize=True),
                             dataset.get_count_matrix().sum(axis=1).squeeze(),
                             [word.decode() for word in dataset.get_vocab()],
                             dataset.get_count_matrix().sum(axis=0).squeeze(),
                             **kwargs),
            str(path),
        )
        return cls(path)
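For context, this example calls the model-agnostic pyLDAvis.prepare directly rather than a gensim adapter. Below is a self-contained toy sketch of its five required arguments; every number is invented and only illustrates the expected shapes:

import numpy as np
import pyLDAvis

# 2 topics over a 3-word vocabulary, 2 documents (toy values; rows sum to 1)
topic_term_dists = np.array([[0.6, 0.3, 0.1],
                             [0.2, 0.2, 0.6]])
doc_topic_dists = np.array([[0.7, 0.3],
                            [0.4, 0.6]])
doc_lengths = [10, 8]                 # total token count of each document
vocab = ['loan', 'credit', 'report']
term_frequency = [7, 5, 6]            # corpus-wide count of each vocab term

vis = pyLDAvis.prepare(topic_term_dists, doc_topic_dists,
                       doc_lengths, vocab, term_frequency, R=3)
pyLDAvis.save_json(vis, 'toy_vis.json')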
def narrative_analysis(df):
    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')  # tokenizer regex
    en_stop = get_stop_words('en')  # English stop word list
    p_stemmer = PorterStemmer()  # Porter stemmer
    texts = []  # tokenized documents

    for index in range(0, len(df.index)):
        if str(df['narrative'].iloc[index]) != 'nan':
            intext = df['narrative'].iloc[index]
            intext = re.sub(r"X+", "", intext)  # drop the XXXX redaction masks
            raw = intext.lower()
            tokens = tokenizer.tokenize(raw)

            # remove stop words from tokens
            stopped_tokens = [tok for tok in tokens if tok not in en_stop]

            # stem tokens and add them to the list
            stemmed_tokens = [p_stemmer.stem(tok) for tok in stopped_tokens]
            texts.append(stemmed_tokens)

    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                               num_topics=25,
                                               id2word=dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)

    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "narrative_lda.html")
    #pyLDAvis.save_json(vis_data, "narrative_lda.json")
    pyLDAvis.save_html(vis_data, "narrative_lda_2016.html")
    pyLDAvis.save_json(vis_data, "narrative_lda_2016.json")

    return 0
Example #8
    def vectorize(self):
        '''
        args:
            none
        output:
            generates an LDA topic model of the document using gensim and pyLDAvis
        '''
        # tokenize and remove stopwords
        sentences = self.sent_detector.tokenize(self.raw.decode('utf-8').strip())  # use raw text
        #sentences = Topic(raw_input('topic: ')).text  # get text from wikipedia
        #stoplist = set('for this that by or is a of the and to in are be as an it can on if at which then also with used such not from use other have some these more using has many one was may often but their they than when been its not all may some have had'.split())
        texts = [[word for word in sentence.lower().split()
                  if word not in self.stopwords] for sentence in sentences]

        # compute the frequency of each token
        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1

        # remove words that appear only once
        texts = [[token for token in text if frequency[token] > 1] for text in texts]

        # construct a gensim dictionary and corpus (bag of words);
        # here each "text" is a sentence of the document
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]

        # define the LDA model
        lda = models.ldamodel.LdaModel(corpus=corpus,
                                       id2word=dictionary,
                                       num_topics=10,  # tunable; 10 is an arbitrary default
                                       update_every=1,
                                       chunksize=10000,
                                       passes=1)

        # visualize the LDA space
        vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
        pyLDAvis.display(vis_data)
        pyLDAvis.show(vis_data)
        with open('topic_models/' + self.name + '.json', 'a+') as topic_json:
            pyLDAvis.save_json(vis_data, topic_json)
        with open('topic_models/' + self.name + '.html', 'a+') as topic_html:
            pyLDAvis.save_html(vis_data, topic_html)
Example #10
    def topic_modelling(self):
        sec = self.__preprocess_text_pylda(self.text)
        dictionary = corpora.Dictionary(sec)
        doc_term_matrix = [dictionary.doc2bow(rev) for rev in sec]
        LDA = gensim.models.ldamodel.LdaModel

        # Build LDA model
        lda_model = LDA(corpus=doc_term_matrix,
                        id2word=dictionary,
                        num_topics=self.no_of_topics,
                        random_state=100,
                        chunksize=1000,
                        passes=50)
        #pyLDAvis.enable_notebook()
        vis = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary)
        pyLDAvis.save_html(vis, directory + "\\templates\\abc.html")
        #Cloudinary_url = self.upload_cloudinary(directory + "\\Text\\Output\\abc.html")
        pyLDAvis.save_json(vis, directory + "\\Text\\Output\\abc.json")

        # parse each topic string, e.g. '0.051*"word" + 0.032*"other"',
        # into its list of words
        topics_ = dict(lda_model.print_topics())
        topic_list = dict()
        for topic_id, topic_str in topics_.items():
            words = []
            for term in topic_str.split("+"):
                weight, word = term.split("*")
                words.append(word.strip().strip('"'))
            topic_list[topic_id] = words
        Cloudinary_url = "http://localhost:5000/abc.html"
        return topics_, topic_list, self.text, Cloudinary_url
Example #11
def LDA(tokens, start, stop, step=1):
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(text) for text in tokens]
    model_list = []
    coherence_values = []
    max_topic_num = 0
    for i in range(start, stop, step):
        print('steps  ', i)
        model = LdaModel(corpus, id2word=dictionary,
                         num_topics=i + 1)  #LDA model
        model_list.append(model)
        coherence_model_lda = CoherenceModel(model,
                                             texts=tokens,
                                             dictionary=dictionary,
                                             coherence='c_v')  # c_v coherence
        coherence_lda = coherence_model_lda.get_coherence()  # coherence score
        if i != start and coherence_lda > max(coherence_values):
            max_topic_num = i
        coherence_values.append(coherence_lda)

    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()  #show graph of coherence score by pyplot
    max_ind = coherence_values.index(max(coherence_values))
    model_list[max_ind].save("result_model")
    prepared_data = gensimvis.prepare(model_list[max_ind],
                                      corpus=corpus,
                                      dictionary=dictionary)
    pyLDAvis.save_html(prepared_data,
                       'res.html')  #save the result of LDA by html file
    pyLDAvis.save_json(prepared_data,
                       'res.json')  #save the result of LDA by JSON file
    return model_list[max_ind], coherence_values[max_ind], max_topic_num
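A hedged usage sketch of the scan above; the toy documents are invented and far too small for a meaningful c_v coherence score, they only show the calling convention:

docs = [['loan', 'payment', 'late', 'fee'],
        ['credit', 'report', 'error', 'dispute'],
        ['loan', 'interest', 'rate', 'mortgage']]
best_model, best_coherence, best_num = LDA(docs, start=1, stop=4)  # fits 2-4 topic models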
Example #13
        # for each (topic,probability) for each document
        # append [(topic, probability),document id] to cluster1
        cluster1.append((x, j, l))
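The loop enclosing this append was cut off in the excerpt. A minimal sketch of how such (topic, probability, document id) triples are typically collected, assuming lda is the trained LdaModel and corpus the bag-of-words corpus used below (both names taken from the surrounding code):

cluster1 = []
for l, doc_bow in enumerate(corpus):               # l: document id
    for x, j in lda.get_document_topics(doc_bow):  # x: topic id, j: probability
        cluster1.append((x, j, l))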

# Save topics
with open('lda_topic.txt', 'w') as file:
    for i in lda.show_topics(num_topics=topic_number_setup):
        file.write(str(i) + '\n')

# topic cluster visualization
# topic term relation json save
movies = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(movies, 'LDA_Visualization.html')

# Topic-Term relationship matrix
pyLDAvis.save_json(movies, 'topic_term.json')
with open('topic_term.json') as json_data:
    d = json.load(json_data)
mat = np.column_stack((d['token.table']['Topic'], d['token.table']['Freq'],
                       d['token.table']['Term']))

# load movie metadata:
meta_dict = {}
with open("movie.metadata.tsv") as tsvfile:
    tsvreader = csv.reader(tsvfile, delimiter="\t")
    for line in tsvreader:
        meta_dict[line[0]] = line[2]


# Enable topic document search
def enable_search():
Example #14
    def write_json_data(self, ldaviz_model, n_topics):
        pyLDAvis.save_json(ldaviz_model, self.paths.ldaviz_json(n_topics))
Example #15
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

##### Visualise topics #####
print('Visualising topics...')
# Visualise
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#15visualizethetopicskeywords

import IPython # still required
import pyLDAvis
from pyLDAvis import gensim

# Visualize the topics
# Visualize the topics in notebook
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# vis
vis = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, mds='mmds')
pyLDAvis.show(vis)

pyLDAvis.save_html(vis, 'topics-lda.html')

print('Save visualisation to json...')
# save_json writes the file itself (and returns None), so it is called directly
pyLDAvis.save_json(vis, 'pylda-vis' + t + '.json')
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=3,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
#doc_lda = lda_model[corpus]

# Visualize the topics
#pyLDAvis.enable_notebook()
print('Working on creating visualization...')
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, n_jobs=1,
                              R=50)  # n_jobs is so it uses up less cpu
print('Going to save html and json...')
pyLDAvis.save_html(vis, 'LDA_Visualization_{}.html'.format(newsSource))
pyLDAvis.save_json(vis, 'LDA_Visualization_{}.json'.format(newsSource))

# TODO NEXT: get keywords from json, save to a .txt file (as temp[NewsSource]), and format
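A sketch of one way to start on that TODO, reading keywords back out of the saved JSON's token.table (the same structure indexed in Example #13 above); the output file name is made up:

import json
from collections import defaultdict

with open('LDA_Visualization_{}.json'.format(newsSource)) as f:
    d = json.load(f)

# group the visualization's terms by topic
terms_by_topic = defaultdict(list)
tt = d['token.table']
for topic, term in zip(tt['Topic'], tt['Term']):
    terms_by_topic[topic].append(term)

with open('temp_{}.txt'.format(newsSource), 'w') as out:  # hypothetical output name
    for topic in sorted(terms_by_topic):
        out.write('Topic {}: {}\n'.format(topic, ', '.join(terms_by_topic[topic])))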
vis_data = {}
gensim_lda_model = {}
for c in cmallet.keys():
    vis_data[c] = {}
    gensim_lda_model[c] = {}
    for i in cmallet[c].keys():
        gensim_lda_model[c][i] = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
            cmallet[c][i])
        vis_data[c][i] = pyLDAvis.gensim.prepare(
            gensim_lda_model[c][i],
            corpora[c],
            dictionary=cmallet[c][i].id2word,
            mds='tsne')
        pyLDAvis.save_json(vis_data[c][i], outdir + f'pyldavis_{c}_{i}.json')
        print(outdir + f'pyldavis_{c}_{i}.json')
        ofdir = web_out_dir + f'{c}-{i}/'
        os.makedirs(ofdir, mode=out_path_mode, exist_ok=True)
        pyLDAvis.save_html(vis_data[c][i],
                           ofdir + f'pyldavis_{c}_{i}.html',
                           ldavis_url=MODIFIED_LDAVIS_URL)
        print(web_out_dir + f'{c}-{i}/pyldavis_{c}_{i}.html')

# #### Save Gensim Mallet Models

# In[38]:

for c in gensim_lda_model.keys():
    for i in gensim_lda_model[c].keys():
        gensim_lda_model[c][i].save(
Example #18
def pyLDAvisUI(lda, tf, tf_vectorizer):
    page = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
    pyLDAvis.save_html(page, 'lda.html')  # save the topic visualization as an HTML file
    pyLDAvis.save_json(page, 'lda.json')
Example #19
    corpus=bow_corpus,
    dictionary=cdict,
    doc_topic_dist=None,
    R=30,
    lambda_step=0.2,
    mds='tsne',
    # mds=<function js_PCoA>,
    n_jobs=-1,
    plot_opts={
        'xlab': 'PC1',
        'ylab': 'PC2'
    },
    sort_topics=True,
)

LDA_HTML = f'data/lda_vis_result_{LDA_TOPIC_NUM}_topics.html'
LDA_JSON = f'data/lda_vis_result_{LDA_TOPIC_NUM}_topics.json'

pyLDAvis.save_html(prepared_data, LDA_HTML)
pyLDAvis.save_json(prepared_data, LDA_JSON)

# %%
# pyLDAvis.display(prepared_data, local=False)
print("Test: 'pyLDAvis' finished.")


#%%
                    vals = line.rstrip('\r\n').split('\t')
                    doc_id = int(vals[0])
                    word_id = int(vals[1])
                    word_count = int(vals[2])
                    doc_lengths[doc_id] += word_count
                    term_frequency[word_id] += word_count
        # Dictionary terms
        vocab = [corpus_dictionary[word_id] for word_id in range(V)]
        # Generate the JSON and html
        prepared_data = pyLDAvis.prepare(topic_term_dists,
                                         doc_topic_dists,
                                         doc_lengths,
                                         vocab,
                                         term_frequency,
                                         R=corpus_n_terms)
        pyLDAvis.save_json(prepared_data,
                           pyLDAvis_dir + '/' + json_name + '.json')
        print('Generating ' + json_name + '.html')
        generate_LDAvis_html(json_name, pyLDAvis_dir)
    else:
        print('Warning: ' + pyLDAvis_dir + '/' + json_name +
              '.json already exists')

if model == 'ngppf' or model == 'jgppf':
    # Generate the network files
    json_name = 'network'
    json_names.append(json_name)
    if not os.path.exists(pyLDAvis_dir + '/' + json_name + '.json'):
        print('Generating ' + json_name + '.json')
        # Topic-term probabilities
        topic_term_dists = P_nk(rkB, phink)
        topic_term_dists = np.array(