def simple_parse(text):

    text = text.lower()
    stop_words = stop_word_list()
    #The word_tokenize() function will break our text phrases into individual words
    tokens = word_tokenize(text)
    #we'll create a new list which contains punctuation we wish to clean
    punctuations = [
        '(', ')', ';', ':', '[', ']', ',', '.', '-', '\"', '\'', '{', '}',
        ' - '
    ]
    special_char = ['#', '<', '>', '*', '+', ' - ', '~', '^', '"']
    #We initialize the stopwords variable which is a list of words like "The", "I", "and", etc. that don't hold much value as keywords
    #We create a list comprehension which only returns a list of words that are NOT IN stop_words and NOT IN punctuations.
    keywords = [
        word for word in tokens if word not in stop_words
        and word not in punctuations and word not in special_char
    ]
    return text, tokens, keywords
Example 2
def simple_parse(text):
    '''
    Tokenizes the input text and returns the lower-cased text, a list of tokens and a list of 'keywords'.
    Keywords are tokens which are not punctuation, stop words, spaces or special characters.

    Parameters
    ----------
    text : str
        Raw text to be tokenized. 

    Returns
    ----------
    text : str
        The lower-cased input text.
    tokens : list
        A list of the tokens from the raw text. 
    keywords : list
        A list of the tokens which are not spaces, stop words, punctuation or special characters.


    '''

    text = text.lower()
    stop_words = stop_word_list()
    #The word_tokenize() function will break our text phrases into individual words
    tokens = word_tokenize(text)
    #we'll create a new list which contains punctuation we wish to clean
    punctuations = [
        '(', ')', ';', ':', '[', ']', ',', '.', '-', '\"', '\'', '{', '}',
        ' - '
    ]
    special_char = ['#', '<', '>', '*', '+', ' - ', '~', '^', '"']
    #We initialize the stopwords variable which is a list of words like "The", "I", "and", etc. that don't hold much value as keywords
    #We create a list comprehension which only returns a list of words that are NOT IN stop_words and NOT IN punctuations.
    keywords = [
        word for word in tokens if word not in stop_words
        and word not in punctuations and word not in special_char
    ]
    return text, tokens, keywords
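
A minimal usage sketch (the sample sentence is illustrative; it assumes NLTK's word_tokenize and the project's stop_word_list() helper are importable, with the NLTK punkt data downloaded):

sample = "The quick brown fox jumped over the lazy dog."
text, tokens, keywords = simple_parse(sample)
print(tokens)    # all lower-cased tokens, punctuation included
print(keywords)  # tokens with stop words, punctuation and special characters filtered out
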
Example 3
def extract(filename):
    #write a for-loop to open many files -- leave a comment if you'd like to learn how

    #filename = 'testfile.pdf'
    stop_words = stop_word_list()
    #open allows you to read the file
    pdfFileObj = open(filename, 'rb')
    #The pdfReader variable is a readable object that will be parsed
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    #discerning the number of pages will allow us to parse through all the pages
    num_pages = pdfReader.numPages
    count = 0
    text = ""
    #The while loop will read each page
    while count < num_pages:
        pageObj = pdfReader.getPage(count)
        count += 1
        text += pageObj.extractText()
    #This if statement checks whether the above library returned any words. It's needed because PyPDF2 cannot read scanned files.
    #If no text was returned, we run the OCR library textract to convert scanned/image-based PDF files into text.
    if text == "":
        text = textract.process(filename, method='tesseract', language='eng')
        #textract returns bytes, so decode them into a string before tokenizing
        text = text.decode('utf-8')
    # Now we have a text variable which contains all the text derived from our PDF file. Type print(text) to see what it contains. It likely contains a lot of spaces, possibly junk such as '\n' etc.
    # Now, we will clean our text variable, and return it as a list of keywords.

    #The word_tokenize() function will break our text phrases into individual words
    tokens = word_tokenize(text)
    #we'll create a new list which contains punctuation we wish to clean
    punctuations = ['(', ')', ';', ':', '[', ']', ',', '.', '-']
    #We initialize the stopwords variable which is a list of words like "The", "I", "and", etc. that don't hold much value as keywords
    #We create a list comprehension which only returns a list of words that are NOT IN stop_words and NOT IN punctuations.
    keywords = [
        word for word in tokens
        if word not in stop_words and word not in punctuations
    ]
    return text, tokens, keywords
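
A hedged usage sketch (assumes PyPDF2, textract and NLTK are installed, and that the PDF path below exists; 'uploads/mytest.pdf' is the same path used in the snippet further down):

text, tokens, keywords = extract('uploads/mytest.pdf')
print(len(tokens), "tokens,", len(keywords), "keywords")
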
Example 4
def build_word_cloud(token_list, n):
    words = token_list
    stop_words = stop_word_list()

    #Removing items from a list while iterating over it skips elements,
    #so filter out the stop words with a list comprehension instead.
    words = [word for word in words if word not in stop_words]

    wordcloud = WordCloud(
        width=1440,
        height=1080,
        background_color='white',
        #colormap="Blues",
        #margin=10,
        stopwords=stop_words,
        max_words=n,
    ).generate(str(words))

    fig = plt.figure(figsize=(20, 15))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.margins(x=0, y=0)
    plt.savefig('static/mycloud', bbox_inches='tight')
Example 5
def build_word_cloud(text, n):
    '''
    Plots the word cloud from the given text and returns the plot as embedded HTML to be included in an HTML file.

    Parameters
    ----------
    text : str
        The text, as a single string, from which to generate the word cloud.
    n : int
        Maximum number of tokens to display in the word cloud.

    Returns
    ----------
    Embedded HTML of the word cloud visualisation. This can simply be added to an HTML template.

    '''

    stop_words = stop_word_list()

    wordcloud = WordCloud(
        width=1440,
        height=1080,
        background_color='white',
        #colormap="Blues",
        #margin=10,
        stopwords=stop_words,
        max_words=n,
    ).generate(str(text))

    fig = plt.figure(figsize=(13, 9))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.margins(x=0, y=0)

    html = mpld3.fig_to_html(fig, no_extras=True, template_type='general')

    return html
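
A usage sketch under the assumption that the keywords come from the extract() helper above; the Flask render_template call in the comment is illustrative:

text, tokens, keywords = extract('uploads/mytest.pdf')
cloud_html = build_word_cloud(" ".join(keywords), 50)
# e.g. in a Flask view: return render_template('report.html', wordcloud=cloud_html)
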
Example 6
import re
import nltk

from gensim.models import word2vec

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import matplotlib as mpl
from wordcloud import WordCloud, STOPWORDS
from stopwords import stop_word_list
from pdf_extractor import extract

import spacy

stop_words = stop_word_list()


text, tokens, keywords = extract('uploads/mytest.pdf')


#Removing items from a list while iterating over it skips elements,
#so filter out the stop words with a list comprehension instead.
tokens = [word for word in tokens if word not in stop_words]

cleantext = " ".join(tokens)



nlp = spacy.load('en_core_web_sm')  # make sure to use larger model!
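
A possible continuation (an illustrative sketch only): run the loaded spaCy pipeline over the cleaned text and inspect the named entities it finds.

doc = nlp(cleantext)
for ent in doc.ents:
    print(ent.text, ent.label_)
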
Example 7
def lda_tsne(total_text,
             file_names,
             n_topics=None,
             n_iter=200,
             n_top_words=None,
             threshold=0.3):

    n_data = len(file_names)

    if n_topics is None:
        n_topics = int(round(((len(file_names)) / 2)**0.5))
    # session['number_topics'] = str(n_topics)

    if n_top_words is None:
        n_top_words = 5
    #  session['number_topwords'] = str(n_top_words)

    t0 = time.time()

    stopwords = stop_word_list()
    cvectorizer = CountVectorizer(min_df=1,
                                  stop_words=stopwords,
                                  lowercase=True,
                                  ngram_range=(1, 3),
                                  max_df=30)
    cvz = cvectorizer.fit_transform(total_text)

    lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
    X_topics = lda_model.fit_transform(cvz)

    ldavis_html = pyladvis_run(lda_model, cvz, cvectorizer)

    print("<<<<<<<<<<LDAVIS OK>>>>>>>")

    print(X_topics)

    t1 = time.time()
    print('\n')

    print('LDA training done; took {} mins'.format((t1 - t0) / 60.))
    print('\n')

    # np.save('mednlp/lda_doc_topic_{}files_{}topics.npy'.format(
    #    X_topics.shape[0], X_topics.shape[1]), X_topics)

    #np.save('mednlp/lda_topic_word_{}files_{}topics.npy'.format(
    #   X_topics.shape[0], X_topics.shape[1]), lda_model.topic_word_)

    ##############################################################################
    # threshold and plot

    #_idx = np.amax(X_topics, axis=1) > threshold  # idx of news that > threshold

    #print('idx:  ' + str(_idx))
    #_topics = X_topics
    print('topics:  ' + str(X_topics))
    num_example = len(X_topics)

    print("num_example: " + str(num_example))

    # t-SNE: 50 -> 2D
    tsne_model = TSNE(n_components=2,
                      verbose=1,
                      random_state=0,
                      angle=.50,
                      init='pca')
    tsne_lda = tsne_model.fit_transform(X_topics[:num_example])

    print("TSNE_LDA")
    print(type(tsne_lda))
    print(tsne_lda)
    print(tsne_lda.shape)

    tsne_lda_df = pd.DataFrame(tsne_lda)

    tsne_lda_df = tsne_lda_df.fillna('')

    tsne_lda = tsne_lda[~np.isnan(tsne_lda).any(axis=1)]
    # find the most probable topic for each news
    _lda_keys = []
    for i in range(X_topics.shape[0]):
        _lda_keys += X_topics[i].argmax(),

    print('lda_keys:  ')
    print(_lda_keys)

    # show topics and their top words
    topic_summaries = []
    topic_word = lda_model.topic_word_  # get the topic words
    vocab = cvectorizer.get_feature_names()
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words +
                                                                 1):-1]
        topic_summaries.append(' '.join(topic_words))

    colormap = np.array([])

    for i in range(n_topics):
        color = "#" + "%06x" % random.randint(0, 0xFFFFFF)
        colormap = np.append(colormap, color)

    print("#########################################################")
    print("COLORMAP")
    print(colormap[_lda_keys][:num_example])
    print("#########################################################")
    print("LDA KEYS")
    print(_lda_keys[:num_example])
    print("#########################################################")

    raw_topic_summaries = []
    for x in _lda_keys:
        raw_topic_summaries.append(topic_summaries[x])

    # plot
    title = " t-SNE visualization of LDA model trained on {} files, " \
            "{} topics, thresholding at {} topic probability, {} iterations ({} data " \
            "points and top {} words)".format(
        X_topics.shape[0], n_topics, threshold, n_iter, num_example, n_top_words)

    plot_lda = bp.figure(
        plot_width=1200,
        plot_height=800,
        title=title,
        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
        x_axis_type=None,
        y_axis_type=None,
        min_border=1)

    if n_data < 30:
        dot_size = 20
    if n_data >= 30 and n_data < 50:
        dot_size = 15
    if n_data >= 50 and n_data < 150:
        dot_size = 11
    if n_data >= 150:
        dot_size = 5

    source = bp.ColumnDataSource(
        data=dict(x=tsne_lda_df.iloc[:, 0],
                  y=tsne_lda_df.iloc[:, 1],
                  color=colormap[_lda_keys][:num_example],
                  file_names=file_names,
                  raw_topic_summaries=raw_topic_summaries))
    plot_lda.scatter(x='x', y='y', color='color', source=source, size=dot_size)

    plot_lda.outline_line_width = 7
    plot_lda.outline_line_alpha = 0.3
    plot_lda.outline_line_color = "#353A40"

    # randomly choose a news (in a topic) coordinate as the crucial words coordinate
    topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan
    for topic_num in _lda_keys:
        if not np.isnan(topic_coord).any():
            break
        topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)]

    # plot crucial words
    for i in range(X_topics.shape[1]):
        plot_lda.text(topic_coord[i, 0], topic_coord[i, 1],
                      [topic_summaries[i]])

    # hover tools
    hover = plot_lda.select(dict(type=HoverTool))
    hover.tooltips = [("file name", "@file_names"),
                      ("topic summary", '@raw_topic_summaries')]

    #save(plot_lda, '20_news_tsne_lda_viz_{}_{}_{}_{}_{}_{}.html'.format(
    #    X_topics.shape[0], n_topics, threshold, n_iter, num_example, n_top_words))

    t2 = time.time()
    print('\n>>> whole process done; took {} mins\n'.format((t2 - t0) / 60.))

    output_file("TSNE_OUTPUT.html", title="TSNE OUTPUT")
    #show(plot_lda)

    script, div = components(plot_lda)

    return script, div, ldavis_html
Example 8
def lda_tsne(total_text, file_names, n_topics=None, n_top_words=None):
    '''
    Handles the process of applying Latent Dirichlet Allocation (LDA) to the input text and the
    dimensionality reduction of the LDA result using t-SNE.
    The LDA algorithm returns a document-topic probability matrix which describes the probabilities of
    the topics in each document.
    The result of t-SNE is a set of x,y coordinates that can be plotted on a scatter plot to visualise the clusters.
    The bokeh library is used for visualisation; it provides interactive plots and hover tools that can
    add extra information to the plot. It also creates an HTML output that can easily be embedded
    within a web page.
    
    Parameters
    ----------
    total_text : list
        A list of strings where each element is all the text of a document in one string.
    file_names : list
        A list of strings where each element is a file name of the files that were uploaded.
    n_topics : int
        A hyperparameter of the LDA model; it sets how many topics are modelled.
    n_top_words : int
        The number of top words used to describe each topic. This is used in the bokeh hover tool.

    Returns
    ----------
    html : str
        The HTML embedding of the bokeh plot; this can be directly embedded in a web page.

    '''

    #loads the Flask session variable so it can be used when serialising objects (pickle) to the file system.
    myid = session['myid']

    n_data = len(file_names)

    #if the number of topics is not specified (like when a user first launches the clustering),
    # it uses a rule of thumb to estimate the number of topics in a corpus of documents
    #the rule of thumb is ((number of documents)/2)^0.5.
    #another option is to use a more advanced algorithm to estimate the number of topics.
    # I have tried HDBSCAN but the result is highly dependent on its 'minimal cluster size' parameter.
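    # As a worked example, 50 uploaded files gives round((50 / 2) ** 0.5) = round(5.0) = 5 topics.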
    if n_topics is None:
        n_topics = int(round(((len(file_names))/2)**0.5))
        session['number_topics'] = str(n_topics)

    # if the number of top words is not specified, use 5 words to describe a topic
    if n_top_words is None:
        n_top_words = 5
        session['number_topwords'] = str(n_top_words)

    #the timing is for testing, to see how long it takes to run certain functions.
    t0 = time.time()

    #loads the list of stop words
    stopwords = stop_word_list()

    #loads the Scikit-Learn CountVectorizer. This will convert the input text into a document-term matrix.
    #It is a matrix that simply registers a count of the different n-grams within the text.
    #When the ngram_range parameter is set to (1, 1) the n-grams are only the individual words within a document,
    # so for the sentence "My name is David" the list of n-grams would be ['my', 'name', 'is', 'david']
    # (CountVectorizer lower-cases by default).
    # If the ngram_range parameter is set to (1, 2) it will also include bigrams;
    # for the same sentence the n-grams would be ['my', 'my name', 'name', 'name is', 'is', 'is david', 'david'].
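    # A quick standalone check of the bigram example above (illustrative, not part of this function):
    #   cv = CountVectorizer(ngram_range=(1, 2))
    #   cv.fit(["My name is David"])
    #   print(cv.get_feature_names())
    #   -> ['david', 'is', 'is david', 'my', 'my name', 'name', 'name is']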
    cvectorizer = CountVectorizer(
        min_df=1, stop_words=stopwords,  lowercase=True, ngram_range=(1, 3))

    # this creates the document-term matrix
    cvz = cvectorizer.fit_transform(total_text)

    t1 = time.time()

    print("Time for count vectorizer (document term matrix): " + str(t1-t0))

    t2 = time.time()
    # generates the lda model with 500 iterations
    lda_model = lda.LDA(n_topics, 500)

    # fits the lda model to the document-term matrix
    X_topics = lda_model.fit_transform(cvz)

    t3 = time.time()

    print("Time for LDA: " + str(t3-t2))

    if not os.path.exists('pickles'):
        os.makedirs('pickles')

    # creates the paths to which the pickled objects will be saved
    lda_model_path = "pickles/lda_model_" + str(myid) + '.p'
    document_term_matrix_path = "pickles/document_term_matrix_" + \
        str(myid) + '.p'
    cvectorizer_path = "pickles/cvectorizer_" + str(myid) + '.p'

    #pickles the objects and saves them
    pickle.dump(lda_model, open(lda_model_path, "wb"))
    pickle.dump(cvz, open(document_term_matrix_path, "wb"))
    pickle.dump(cvectorizer, open(cvectorizer_path, "wb"))

    #the number of files uploaded
    num_example = len(X_topics)

    t4 = time.time()

    #creates the t-SNE object that will be used; the number of components refers to the number of output dimensions
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.2,
                      init='pca')

    #uses t-SNE to calculate the 2-D coordinates representing the documents.
    tsne_lda = tsne_model.fit_transform(X_topics[:num_example])

    t5 = time.time()

    print("Time for TSNE: " + str(t5-t4))

    #Some processing of the data using pandas to remove any NaN values from the t-SNE output.
    tsne_lda_df = pd.DataFrame(tsne_lda)

    print(tsne_lda_df.describe())

    tsne_lda_df = tsne_lda_df.fillna('')

    tsne_lda = tsne_lda[~np.isnan(tsne_lda).any(axis=1)]

    tsne_lda_df = tsne_lda_df[~tsne_lda_df.isin(
        [np.nan, np.inf, -np.inf]).any(1)]

    print(tsne_lda_df.describe())

    # finds the most probable topic for each document and saves it into the list
    _lda_keys = []
    for i in range(X_topics.shape[0]):
        _lda_keys += X_topics[i].argmax(),

    #gets the most probable words of each topic as a representation of that topic.
    topic_summaries = []
    topic_word = lda_model.components_  # get the topic words
    vocab = cvectorizer.get_feature_names()
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(
            topic_dist)][:-(n_top_words+1):-1]
        topic_summaries.append(' '.join(topic_words))

    #creates a colourmap to colour each topic in a separate randomly chosen colour
    colormap = np.array([])

    for i in range(n_topics):
        color = "#" + "%06x" % random.randint(0, 0xFFFFFF)
        colormap = np.append(colormap, color)

    raw_topic_summaries = []
    for x in _lda_keys:
        raw_topic_summaries.append(topic_summaries[x])

    t6 = time.time()
    title = " t-SNE visualization of LDA model trained on {} files, " \
            "{} topics, {} data " \
            "points and top {} words".format(
                X_topics.shape[0], n_topics, num_example, n_top_words)

    #creates the bokeh figure object that will be used to create the scatter plot
    plot_lda = bp.figure(plot_width=1200, plot_height=700,
                         title=title,
                         tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                         x_axis_type=None, y_axis_type=None, min_border=1)

    # defines the size of the plot dots, the more there are of them the smaller they should be
    if n_data < 30:
        dot_size = 20
    if n_data >= 30 and n_data < 50:
        dot_size = 15
    if n_data >= 50 and n_data < 150:
        dot_size = 11
    if n_data >= 150:
        dot_size = 5

    #this object defines the parameters of the plot in the form of a dictionary. The file_names and raw_topic_summaries are used
    #for the plot's hover tool.
    source = bp.ColumnDataSource(data=dict(x=tsne_lda_df.iloc[:, 0], y=tsne_lda_df.iloc[:, 1],
                                           color=colormap[_lda_keys][:num_example], file_names=file_names, 
                                           raw_topic_summaries=raw_topic_summaries))
    plot_lda.scatter(x='x', y='y',
                     color='color',
                     source=source, size=dot_size)

    plot_lda.outline_line_width = 7
    plot_lda.outline_line_alpha = 0.3
    plot_lda.outline_line_color = "#353A40"

    # randomly chooses a file as the coordinate at which to show the topic words.
    topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan
    for topic_num in _lda_keys:
        if not np.isnan(topic_coord).any():
            break
        topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)]

    # plots the top words
    for i in range(X_topics.shape[1]):
        plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [
                      topic_summaries[i]])

    #sets the bokeh's hover tool to display the file name and topic summary of 
    # a dot when the cursor hovers over a dot.
    hover = plot_lda.select(dict(type=HoverTool))
    hover.tooltips = [("file name", "@file_names"),
                      ("topic summary", '@raw_topic_summaries')]

    t7 = time.time()
    print("Time for Bokeh plotting: " + str(t7-t6))

    print('\n>>> whole process done; took {} mins\n'.format((t7 - t0) / 60.))

    #creates the html code of the visualisation that will be used in the html template.
    html = file_html(plot_lda, CDN)

    #pickles and saves the objects for later use
    raw_topic_summaries_path = "pickles/raw_topic_summaries" + str(myid) + '.p'
    lda_keys_path = "pickles/lda_keys_path" + str(myid) + '.p'

    pickle.dump(raw_topic_summaries, open(raw_topic_summaries_path, "wb"))
    pickle.dump(_lda_keys, open(lda_keys_path, "wb"))

    return html
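
A hedged usage sketch (assumes a Flask request context so that session['myid'] is set, and that total_text and file_names come from the upload step; the file names below are illustrative):

file_names = ['report_a.pdf', 'report_b.pdf', 'report_c.pdf']
total_text = [extract('uploads/' + name)[0] for name in file_names]
plot_html = lda_tsne(total_text, file_names)
# plot_html can be embedded directly in the page template.
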
Example 9
def lda_tsne(total_text, file_names, n_topics=None, n_top_words=None):

    myid = session['myid']

    n_data = len(file_names)

    if n_topics is None:
        n_topics = int(round(((len(file_names)) / 2)**0.5))
        session['number_topics'] = str(n_topics)

    if n_top_words is None:
        n_top_words = 5
        session['number_topwords'] = str(n_top_words)

    t0 = time.time()

    stopwords = stop_word_list()
    cvectorizer = CountVectorizer(min_df=1,
                                  stop_words=stopwords,
                                  lowercase=True,
                                  ngram_range=(1, 3))
    cvz = cvectorizer.fit_transform(total_text)

    t1 = time.time()

    print("Time for count vectorizer (document term matrix): " + str(t1 - t0))

    #lda_model = LatentDirichletAllocation(n_components=n_topics)
    t2 = time.time()
    lda_model = lda.LDA(n_topics, 500)

    X_topics = lda_model.fit_transform(cvz)

    t3 = time.time()

    print("Time for LDA: " + str(t3 - t2))

    # print("NUMBER OF ITERATIONS OF LDA: " + str(lda_model.n_iter_))

    if not os.path.exists('pickles'):
        os.makedirs('pickles')

    lda_model_path = "pickles/lda_model_" + str(myid) + '.p'
    document_term_matrix_path = "pickles/document_term_matrix_" + str(
        myid) + '.p'
    cvectorizer_path = "pickles/cvectorizer_" + str(myid) + '.p'

    pickle.dump(lda_model, open(lda_model_path, "wb"))
    pickle.dump(cvz, open(document_term_matrix_path, "wb"))
    pickle.dump(cvectorizer, open(cvectorizer_path, "wb"))

    ##############################################################################

    num_example = len(X_topics)

    t4 = time.time()
    # t-SNE: 50 -> 2D
    tsne_model = TSNE(n_components=2,
                      verbose=1,
                      random_state=0,
                      angle=.2,
                      init='pca')
    tsne_lda = tsne_model.fit_transform(X_topics[:num_example])

    t5 = time.time()

    print("Time for TSNE: " + str(t5 - t4))

    tsne_lda_df = pd.DataFrame(tsne_lda)

    print(tsne_lda_df.describe())

    tsne_lda_df = tsne_lda_df.fillna('')

    tsne_lda = tsne_lda[~np.isnan(tsne_lda).any(axis=1)]

    tsne_lda_df = tsne_lda_df[~tsne_lda_df.isin([np.nan, np.inf, -np.inf]).
                              any(1)]

    print(tsne_lda_df.describe())
    # find the most probable topic for each news
    _lda_keys = []
    for i in range(X_topics.shape[0]):
        _lda_keys += X_topics[i].argmax(),

    print("LDA")
    print(_lda_keys)
    # show topics and their top words
    topic_summaries = []
    topic_word = lda_model.components_  # get the topic words
    vocab = cvectorizer.get_feature_names()
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words +
                                                                 1):-1]
        topic_summaries.append(' '.join(topic_words))

    colormap = np.array([])

    for i in range(n_topics):
        color = "#" + "%06x" % random.randint(0, 0xFFFFFF)
        colormap = np.append(colormap, color)

    raw_topic_summaries = []
    for x in _lda_keys:
        raw_topic_summaries.append(topic_summaries[x])

    # plot

    t6 = time.time()
    title = " t-SNE visualization of LDA model trained on {} files, " \
            "{} topics, {} data " \
            "points and top {} words".format(
        X_topics.shape[0], n_topics, num_example, n_top_words)

    plot_lda = bp.figure(
        plot_width=1200,
        plot_height=800,
        title=title,
        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
        x_axis_type=None,
        y_axis_type=None,
        min_border=1)

    if n_data < 30:
        dot_size = 20
    if n_data >= 30 and n_data < 50:
        dot_size = 15
    if n_data >= 50 and n_data < 150:
        dot_size = 11
    if n_data >= 150:
        dot_size = 5

    source = bp.ColumnDataSource(
        data=dict(x=tsne_lda_df.iloc[:, 0],
                  y=tsne_lda_df.iloc[:, 1],
                  color=colormap[_lda_keys][:num_example],
                  file_names=file_names,
                  raw_topic_summaries=raw_topic_summaries))
    plot_lda.scatter(x='x', y='y', color='color', source=source, size=dot_size)

    plot_lda.outline_line_width = 7
    plot_lda.outline_line_alpha = 0.3
    plot_lda.outline_line_color = "#353A40"

    # randomly choose a news (in a topic) coordinate as the crucial words coordinate
    topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan
    for topic_num in _lda_keys:
        if not np.isnan(topic_coord).any():
            break
        topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)]

    # plot crucial words
    for i in range(X_topics.shape[1]):
        plot_lda.text(topic_coord[i, 0], topic_coord[i, 1],
                      [topic_summaries[i]])

    # hover tools
    hover = plot_lda.select(dict(type=HoverTool))
    hover.tooltips = [("file name", "@file_names"),
                      ("topic summary", '@raw_topic_summaries')]

    t7 = time.time()
    print("Time for Bokeh plotting: " + str(t7 - t6))

    print('\n>>> whole process done; took {} mins\n'.format((t7 - t0) / 60.))

    html = file_html(plot_lda, CDN)

    raw_topic_summaries_path = "pickles/raw_topic_summaries" + str(myid) + '.p'
    lda_keys_path = "pickles/lda_keys_path" + str(myid) + '.p'

    pickle.dump(raw_topic_summaries, open(raw_topic_summaries_path, "wb"))
    pickle.dump(_lda_keys, open(lda_keys_path, "wb"))

    return html