Example #1
def get_lda_vis(clustering_pipeline):
    """generates topic-term 2D visualization using pyLDAvis

    Parameters
    ----------
    clustering_pipeline : class reference
        The current modeling pipeline
    """

    with st.spinner(
            "Loading visualization... Once ready, save the generated HTML file shown below."
    ):
        ldavis = clustering_pipeline.generate_ldavis()

        # Temporary fix for a pyLDAvis validation error:
        # https://stackoverflow.com/questions/47998685/pyldavis-validation-error-on-trying-to-visualize-topics
        # In the dockerized version we cannot comment out line 375 of pyLDAvis's _prepare module,
        # i.e. _input_validate(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency).
        if ldavis == "This visualization is currently not available.":
            st.warning(ldavis)
            return

        st.markdown(
            "Set view to widescreen or open HTML in new tab for the best experience."
        )

        ldavis_html = pyLDAvis.prepared_data_to_html(ldavis)
        b64 = base64.b64encode(ldavis_html.encode()).decode(
        )  # some strings <-> bytes conversions necessary here
        href = f'<a href="data:text/html;base64,{b64}">Download HTML File</a> \
            (right-click and save as &lt;some_name&gt;.html)'

        st.markdown(href, unsafe_allow_html=True)

        iframe = f'<iframe width="100%" height="900" src="data:text/html;base64,{b64}">The “iframe” tag is not supported by your browser.</iframe>'

        st.write(iframe, unsafe_allow_html=True)
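
Example #1 builds the download link and iframe by hand from a base64 data URI; on newer Streamlit releases (roughly 0.88 and later, an assumption about the reader's environment) the same result can be obtained with st.download_button and components.html. A minimal sketch, with render_ldavis as a hypothetical helper name:

import streamlit as st
import streamlit.components.v1 as components


def render_ldavis(ldavis_html: str):
    # ldavis_html: the string returned by pyLDAvis.prepared_data_to_html(ldavis)
    # one-click download instead of the hand-built base64 <a href=...> link
    st.download_button("Download HTML File", data=ldavis_html,
                       file_name="ldavis.html", mime="text/html")
    # embed the visualization directly instead of a base64 data-URI iframe
    components.html(ldavis_html, height=900, scrolling=True)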
Example #2
 def pipeline(self, model, corpus, dictionary, K):
     viz = pyLDAvis.gensim.prepare(model, corpus, dictionary)
     html = pyLDAvis.prepared_data_to_html(viz, template_type="general")
     self.viz = viz
     self.html = html
     self.K = K
     self.save_html()
Example #3
def get_topic_by_lda(dictionary_list,
                     number_topics=5,
                     ldavis_url=None,
                     ldavis_css_url=None):
    dictionary = corpora.Dictionary(dictionary_list)
    dictionary.filter_extremes(no_below=0, no_above=1.0)
    corpus = [dictionary.doc2bow(text) for text in dictionary_list]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                               num_topics=number_topics,
                                               id2word=dictionary,
                                               passes=20)
    coherence_model_object = CoherenceModel(model=ldamodel,
                                            corpus=corpus,
                                            texts=dictionary_list,
                                            dictionary=dictionary,
                                            coherence='c_v')
    coherence_score = coherence_model_object.get_coherence()
    topic_list = ldamodel.show_topics(num_topics=number_topics,
                                      num_words=30,
                                      formatted=False)
    data_prepared_object = pyLDAvis.gensim.prepare(ldamodel,
                                                   corpus,
                                                   dictionary,
                                                   n_jobs=1)
    formatted_html = pyLDAvis.prepared_data_to_html(
        data_prepared_object,
        ldavis_url=ldavis_url,
        ldavis_css_url=ldavis_css_url)
    return formatted_html, topic_list, coherence_score
Example #4
def lda_model(data, corpus, dictionary, num_topics):
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

    # Visualize topics with pyLDAvis
    lda_data = gensimvis.prepare(lda, corpus, dictionary)
    html_string = pyLDAvis.prepared_data_to_html(lda_data)
    components.v1.html(html_string, width=1280, height=1024)

    # Visualize documents w/ t-SNE
    visualize_topics(data, corpus, lda, num_topics)
Example #5
 def __render_model(self, model, corpus, dict, ntopics):
     data = pyLDAvis.gensim.prepare(model, corpus, dict)
     div_id = "pyldavis"
     html = pyLDAvis.prepared_data_to_html(data,
                                           template_type="simple",
                                           visid=div_id)
     found = '!function(LDAvis){' + re.search(
         r"\!function\(LDAvis\)\{(.+?)\}\(LDAvis\)\;", html,
         re.MULTILINE | re.DOTALL).group(1) + '}(LDAvis);'
     #print("Found->",found)
     return found
Example #6
def topicmodel_forproyect(id_proyect):

    df_comments = get_data(id_proyect)
    #list_mask=np.unique(df_comments.project_id)

    #mask = df_comments["project_id"] == id_proyect

    #df2 = pd.read_excel("datos_congresista_virtual.xlsx", sheet_name="clasificaciones")
    num_topics = 5

    df2 = df_comments.body
    df2 = df2.str.lower()
    pattern = r"@([A-Za-z0-9_]+)"
    df2 = df2.str.replace(pattern, '')

    elements = np.array(df2.tolist())
    tokenizer = RegexpTokenizer(r'\w+')
    es_stop = get_stop_words('es')
    p_stemmer = PorterStemmer()
    texts = []
    print(str(id_proyect))
    for i in elements:
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in es_stop]
        #stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stopped_tokens)
        #texts.append(stemmed_tokens)
        print(i)

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    #ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary, passes=20)
    #ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary, distributed=True, passes=20)
    try:
        ldamodel = gensim.models.ldamulticore.LdaMulticore(
            corpus, num_topics=num_topics, id2word=dictionary, passes=20)
    except ValueError:
        return "Coleccion Vacia. Aparentemente parametros faltantes o mal ingresados."

    import pyLDAvis.gensim
    import pyLDAvis

    vis_data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    pyLDAvis.display(vis_data)

    return pyLDAvis.prepared_data_to_html(vis_data)
Example #7
def corp_eval(dictionary, tokens, corpus, q_count, num_of_topics):
    """Evaluate the corpus and produce gensim visualization."""
    i = len(tokens)
    lda = gensim.models.ldamodel.LdaModel(corpus,
                                          id2word=dictionary,
                                          num_topics=num_of_topics,
                                          passes=1,
                                          alpha='symmetric',
                                          eta=None)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    logging.debug(dictionary.token2id)
    logging.debug(viewitems(dictionary.dfs))

    print(Fore.GREEN + "Producing LDA analysis for question: ", q_count,
          Style.RESET_ALL)
    print(lda)
    vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    print(Fore.YELLOW + "These are the current topics: " + Style.RESET_ALL)
    print(lda.print_topics(i))
    print(
        Fore.CYAN + "Opening up visualization in a new tab in the browser...",
        Style.RESET_ALL)

    # Writing HTML of visualization to file instead of showing with pyLDAvis show function
    # because the show function starts a server, which allows only one file to be displayed
    # at once.
    vis_html_text = pyLDAvis.prepared_data_to_html(vis)
    vis_html_file_name = "vis" + str(q_count) + ".html"
    vis_html_file = open(vis_html_file_name, "w")
    vis_html_file.write(vis_html_text)

    # Getting path to the edurate_gensim.py module, which is in the same directory
    # as the HTML file. This path will be used to generate the file path to the HTML
    # that is to be displayed.
    MODULE_NAME = "edurate_gensim.py"
    PATH_TO_MODULE = inspect.stack()[0][1]
    # Removing name of module from path so that the path only includes up to the
    # directory where the HTML file is located.
    PATH_TO_HTML = PATH_TO_MODULE[:-len(MODULE_NAME)]
    webbrowser.open("file:///" + PATH_TO_HTML + vis_html_file_name, new=2)

    logging.info("Gensim visualization has been displayed.")
    return dictionary.dfs
Example #8
def pyldavis_run(lda_model_path, document_term_matrix_path, vectorizer_path):
    '''
    Computes the pyLDAvis visualisation of the LDA model.

    Parameters
    ----------
    lda_model_path : str
        Path of the pickle object (serialised Python object) of the LDA model. This is created in the lda_tsne_model2.py module.
    document_term_matrix_path : str
        Path of the pickle object (serialised Python object) of the document-term matrix, which is created using the CountVectorizer in the lda_tsne_model2.py module.
    vectorizer_path : str
        Path of the pickle object (serialised Python object) of the vectorizer used to create the document-term matrix. This is usually the CountVectorizer in the lda_tsne_model2.py module.

    Returns
    -------
    Embedded HTML pyLDAvis visualisation of the LDA model.
    '''

    t0 = time.time()

    # Load the pickle objects from the path parameters.
    lda_model = pickle.load(open(lda_model_path, "rb"))
    document_term_matrix = pickle.load(open(document_term_matrix_path, "rb"))
    cvectorizer = pickle.load(open(vectorizer_path, "rb"))

    # Prepare the pyLDAvis visualisation. There is a choice of dimensionality reduction methods here; t-SNE is chosen as it is
    # consistent with the previous analysis in the lda_tsne_model2.py module and has been shown to yield better results than the other available methods.
    prepared_data = prepare(lda_model,
                            document_term_matrix,
                            cvectorizer,
                            mds='tsne',
                            plot_opts={
                                'xlab': '',
                                'ylab': ''
                            })

    html = pyLDAvis.prepared_data_to_html(prepared_data)

    t1 = time.time()
    print("time for pyldavis: " + str(t1 - t0), file=sys.stdout)

    return html
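
The docstring above expects three pickles produced elsewhere (it points to an lda_tsne_model2.py module that is not shown here). A minimal sketch, using a toy corpus and hypothetical file names, of how matching scikit-learn objects could be serialised for pyldavis_run():

import pickle

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# toy corpus standing in for whatever lda_tsne_model2.py actually processes
docs = ["topic modelling of short texts", "another short document about texts"]

cvectorizer = CountVectorizer()
document_term_matrix = cvectorizer.fit_transform(docs)
lda_model = LatentDirichletAllocation(n_components=2, random_state=0).fit(document_term_matrix)

# serialise the three objects that pyldavis_run() expects to load
with open("lda_model.pkl", "wb") as f:
    pickle.dump(lda_model, f)
with open("document_term_matrix.pkl", "wb") as f:
    pickle.dump(document_term_matrix, f)
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(cvectorizer, f)

# html = pyldavis_run("lda_model.pkl", "document_term_matrix.pkl", "vectorizer.pkl")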
Example #9
def get_topic_by_lda(dictionary_list,
                     number_topics=5,
                     ldavis_url=None,
                     ldavis_css_url=None):
    dictionary = corpora.Dictionary(dictionary_list)
    dictionary.filter_extremes(no_below=0, no_above=1.0)
    corpus = [dictionary.doc2bow(text) for text in dictionary_list]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                               num_topics=number_topics,
                                               id2word=dictionary,
                                               passes=20)
    data_prepared_object = pyLDAvis.gensim.prepare(ldamodel,
                                                   corpus,
                                                   dictionary,
                                                   n_jobs=1)
    formatted_html = pyLDAvis.prepared_data_to_html(
        data_prepared_object,
        ldavis_url=ldavis_url,
        ldavis_css_url=ldavis_css_url)
    return formatted_html
Example #10
    def ldavis(self, input_files, param, tool_id):

        data_to_return = {"data": {}}
        ok_to_process = False

        # Check the tool needs
        # -----
        if "d-model-corpus" in input_files and "d-dictionary-corpus" in input_files and "d-gensimldamodel" in input_files:
            ok_to_process = len(input_files["d-model-corpus"]) and len(
                input_files["d-dictionary-corpus"]) and len(
                    input_files["d-gensimldamodel"])

        if not ok_to_process:
            res_err = {"data": {}}
            res_err["data"]["error"] = "Input data missing!"
            return res_err

        corpus = []
        for file_k in input_files["d-model-corpus"]:
            for d in input_files["d-model-corpus"][file_k]:
                corpus.append(d["value"])

        dictionary = None
        for file_k in input_files["d-dictionary-corpus"]:
            dictionary = input_files["d-dictionary-corpus"][file_k]

        ldamodel = None
        for file_k in input_files["d-gensimldamodel"]:
            ldamodel = input_files["d-gensimldamodel"][file_k]

        # Params
        # -----
        # NO PARAMS
        vis = pyLDAvis.gensim.prepare(ldamodel,
                                      corpus,
                                      dictionary,
                                      sort_topics=False)
        html_str = pyLDAvis.prepared_data_to_html(vis)
        data_to_return["data"]["d-ldavis-html"] = {"ldavis": html_str}
        return data_to_return
Example #11
def main():
    hyperparameters = get_hyperparameters()
    if len(sys.argv) > 1:
        args = vars(utils.parse_args())
        args = {k: v for k, v in args.items() if v is not None}
        hyperparameters.update(args)

    wandb.init(project="bom-topic-modelling", config=hyperparameters)

    lm, corpus, dictionary = train(**hyperparameters)

    lm.save(os.path.join(wandb.run.dir, 'lda.model'))

    # topic difference heatmap
    mdiff, _ = lm.diff(lm, distance='jaccard', num_words=50)
    fig = px.imshow(mdiff, origin='lower', color_continuous_scale='RdBu_r')
    wandb.log({"topic_diff": fig})

    # pyLDAvis
    vis = pyLDAvis.gensim.prepare(lm, corpus, dictionary)
    html = pyLDAvis.prepared_data_to_html(vis)
    wandb.log({"pyLDAvis": wandb.Html(html, inject=False)})
Example #12
def topic_modelling(data):
    abstracts = []
    for abstract in data:
        # Remove punctuation
        abstract = re.sub('[,\.!?]', '', abstract)
        # Remove numbers
        abstract = re.sub('[0-9]', '', abstract)
        # Convert the abstracts to lowercase
        abstract = abstract.lower()
        abstracts.append(abstract)
    # Splitting abstracts
    snnipets = []
    for abstract in abstracts:
        if abstract != "abstract not available":
            length = len(abstract)
            index = 0
            last_i = 0
            n = 256
            while index < length:
                i = abstract.rfind(". ", index, index + n)
                if i == -1 or i == index:
                    i = index + n
                text = abstract[index:i + 2]
                index = i + 2
                snnipets.append(text)
    # Creating LDA
    #number_topics = 5
    tf_vectorizer = CountVectorizer(stop_words='english')
    tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
    dtm_tfidf = tfidf_vectorizer.fit_transform(snnipets)
    lda_tfidf = LDA(random_state=0)
    lda_tfidf.fit(dtm_tfidf)
    # Visualizing LDA
    data = pyLDAvis.sklearn.prepare(lda_tfidf,
                                    dtm_tfidf,
                                    tfidf_vectorizer,
                                    mds='mmds')
    html = pyLDAvis.prepared_data_to_html(data, template_type="simple")
    return html
Example #13
def show_vis(vis):
    # Writing HTML of visualization to file instead of showing with pyLDAvis show function
    # because the show function starts a server, which allows only one file to be displayed
    # at once.
    print(
        Fore.CYAN + "Opening up visualization in a new tab in the browser...",
        Style.RESET_ALL)
    vis_html_text = pyLDAvis.prepared_data_to_html(vis)
    vis_html_file_name = defaults.GENSIM_OUTPUT_FILENAME
    vis_html_file = open(vis_html_file_name, "a")
    vis_html_file.write(vis_html_text)

    # Getting path to the refl_gensim.py module, which is in the same directory
    # as the HTML file. This path will be used to generate the file path to the HTML
    # that is to be displayed.
    MODULE_NAME = "refl_gensim.py"
    PATH_TO_MODULE = inspect.stack()[0][1]
    # Removing name of module from path so that the path only includes up to the
    # directory where the HTML file is located.
    PATH_TO_HTML = PATH_TO_MODULE[:-len(MODULE_NAME)]
    webbrowser.open("file:///" + PATH_TO_HTML + "e/" + vis_html_file_name,
                    new=2)
    logging.info("Gensim visualization has been displayed.")
    return
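
The comments in Examples #7 and #13 explain why the HTML is written to a file by hand rather than served through pyLDAvis.show(); pyLDAvis also ships a save_html() helper that performs the same write in one call. A minimal sketch, with write_vis as a hypothetical wrapper and assuming vis is an already-prepared PreparedData object:

import pyLDAvis


def write_vis(vis, path="vis.html"):
    # `vis` is a PreparedData object, e.g. from pyLDAvis.gensim.prepare(...).
    # save_html() writes the same standalone HTML that prepared_data_to_html(vis)
    # returns, without starting the local server that pyLDAvis.show() would launch.
    pyLDAvis.save_html(vis, path)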
Example #14
def _dtm(table, input_col, topic_name='topic', num_topic=5, num_topic_word=10, max_iter=20, time_slice=None,
         coherence='u_mass', vis_time=0, seed=None):
    running_os = platform.system()
    is_os_64bit = platform.machine().endswith('64')
    if running_os == 'Linux':
        if is_os_64bit:
            dtm_filename = 'dtm-linux64'
        else:
            dtm_filename = 'dtm-linux32'
    elif running_os == 'Windows':
        if is_os_64bit:
            dtm_filename = 'dtm-win64.exe'
        else:
            dtm_filename = 'dtm-win32.exe'
    else:  # Mac
        dtm_filename = 'dtm-darwin64'
    dtm_path = os.path.join(str(pathlib.Path(__file__).parent.absolute()), 'dtm', dtm_filename)
    if running_os != 'Windows':
        bash_command = "chmod +x {}".format(dtm_path)
        os.system(bash_command)
    tokenized_doc = np.array(table[input_col])
    num_doc = len(tokenized_doc)
    if time_slice is None:
        time_slice = [num_doc]
    elif sum(time_slice) != num_doc:
        raise_runtime_error("The sum of time slice list does not match the number of documents.")
    if vis_time < 0 or vis_time >= len(time_slice):
        raise_runtime_error("Invalid time parameter: {}".format(vis_time))
    dictionary = corpora.Dictionary(tokenized_doc)
    corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
    dtm_params = {"corpus": corpus,
                  "id2word": dictionary,
                  "time_slices": time_slice,
                  "num_topics": num_topic,
                  "lda_sequence_max_iter": max_iter,
                  "model": 'dtm'}
    if seed is not None:
        dtm_params["rng_seed"] = seed
    dtm_model = DtmModel(dtm_path, **dtm_params)

    topic_time = [[dtm_model.show_topic(topicid=id, time=t, topn=num_topic_word) for id in range(num_topic)]
                  for t in range(len(time_slice))]
    topic_time = [[["{}: {}".format(tup[1], tup[0]) for tup in topic] for topic in time] for time in topic_time]
    timeline = ["{} ({} docs)".format(ind, t) for ind, t in enumerate(time_slice)]
    columns = ["topic_{}".format(i + 1) for i in range(num_topic)]
    topic_table = pd.DataFrame(topic_time, columns=columns)
    topic_table['time'] = timeline
    topic_table = topic_table[['time'] + columns]

    prop_arr = dtm_model.gamma_
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains Topic Column Name. Please choose again."}])
    out_table[topic_name] = [item.argmax() + 1 for item in prop_arr]
    out_table['topic_distribution'] = prop_arr.tolist()

    coherence_topic_arr = [dtm_model.dtm_coherence(time) for time in range(len(time_slice))]
    if coherence == 'u_mass':
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, coherence='u_mass').get_coherence()
                   for item in coherence_topic_arr]
    else:
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, texts=tokenized_doc,
                                  coherence='c_v').get_coherence() for item in coherence_topic_arr]

    doc_topic, topic_term, doc_lengths, term_frequency, vocab = dtm_model.dtm_vis(corpus, vis_time)
    prepared_data = plv.prepare(topic_term, doc_topic, doc_lengths, vocab, term_frequency, sort_topics=False)
    html_result = plv.prepared_data_to_html(prepared_data)

    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'Number of topics': num_topic,
              'Number of words for each topic': num_topic_word,
              'Maximum number of iterations': max_iter,
              'Time slice': time_slice,
              'Coherence measure': coherence,
              'Time to visualize': vis_time}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Dynamic Topic Modeling Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    | ### Coherence for each period
    | {coh_arr}
    |
    | ### Parameters
    | {params}
    """.format(coh_arr=coh_arr, params=dict2MD(params))))

    model = _model_dict('dtm_model')
    model['params'] = params
    model['dtm_model'] = dtm_model
    model['coherences'] = coh_arr
    model['corpus'] = corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
Example #15
def _gsdmm(table,
           input_col,
           topic_name='topic',
           K=10,
           alpha=0.1,
           beta=0.1,
           max_iter=50,
           num_topic_words=3):
    docs = np.array(table[input_col])
    docs_set = [set(doc) for doc in docs]
    docs_preprocessed = [list(doc_set) for doc_set in docs_set]
    vocab_set = list(set.union(*docs_set))
    vocab_size = len(vocab_set)

    # initialize and train a GSDMM model
    mgp = gsdmm_rwalk.MovieGroupProcess(K=K,
                                        alpha=alpha,
                                        beta=beta,
                                        n_iters=max_iter)
    topics = mgp.fit(docs_preprocessed, vocab_size)

    # generate topic table
    topic_word_count = mgp.cluster_word_distribution
    topic_words_raw = [[ind, _count_to_ratio_raw(word_count)]
                       for ind, word_count in enumerate(topic_word_count)
                       if word_count]
    topic_words = [[item[0]] + _gen_table(item[1], num_topic_words)
                   for item in topic_words_raw]

    # reset topic ids
    nonempty_topic_indices = [item[0] for item in topic_words]
    reset_topic_ind = {
        old_ind: (new_ind + 1)
        for new_ind, old_ind in enumerate(nonempty_topic_indices)
    }
    topics = [reset_topic_ind[old_ind] for old_ind in topics]
    topic_words = [[reset_topic_ind[old_item[0]]] + old_item[1:]
                   for old_item in topic_words]

    # generate output dataframes
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Existing table contains the topic column name. Please choose another name."
        }])
    out_table[topic_name] = topics
    columns = ['index', 'vocabularies_weights', 'vocabularies', 'weights']
    topic_table = pd.DataFrame(topic_words, columns=columns)
    topic_table['weights'] = topic_table['weights'].apply(pd.to_numeric)

    # pyLDAvis
    if len(topic_words) == 1:
        html_result = None
    else:
        topic_words_dicts = [item[1] for item in topic_words_raw]
        topic_term_dists = [[
            topic_words_dict.get(word, 0) for word in vocab_set
        ] for topic_words_dict in topic_words_dicts]
        num_docs = len(topics)
        num_topics = len(topic_words_raw)
        doc_topic_dists = np.zeros((num_docs, num_topics))
        for doc_id, topic_id in enumerate(topics):
            doc_topic_dists[doc_id][topic_id - 1] = 1.0
        doc_lengths = [len(doc) for doc in docs_preprocessed]
        vocab_count = functools.reduce(
            lambda dict_1, dict_2: {
                word: dict_1.get(word, 0) + dict_2.get(word, 0)
                for word in set(dict_1).union(dict_2)
            }, topic_word_count)
        term_frequency = [vocab_count.get(word) for word in vocab_set]

        prepared_data = pyLDAvis.prepare(topic_term_dists, doc_topic_dists,
                                         doc_lengths, vocab_set,
                                         term_frequency)
        html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {
        'Input column': input_col,
        'Topic column name': topic_name,
        'K': K,
        'Alpha': alpha,
        'Beta': beta,
        'Maximum number of iterations': max_iter,
        'Number of words for each topic': num_topic_words
    }
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## GSDMM Result
    | ### Summary
    |
    """))
    if html_result is not None:
        rb.addHTML(html_result)
        rb.addMD(strip_margin("""
        |
        """))
    rb.addMD(
        strip_margin("""
    | ### Final Number of Topics
    | {num_topics}
    |
    | ### Parameters
    | {params}
    """.format(num_topics=len(topic_words_raw), params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['gsdmm_model'] = mgp
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
Example #16
 def visualise_lda(self,lda_model,corpus,dct):
     vis = pyLDAvis.gensim.prepare(lda_model, corpus, dct)
     lda_html=pyLDAvis.prepared_data_to_html(vis)
     #data_path="/Users/ankitanand/Box/UB/Fall 2019/IR/Proj1/cooked/lda.html"
     #lda_html=pyLDAvis.save_html(vis,data_path)
     return lda_html
Example #17
def buildsklearnselectedworks(so: SearchObject, bagsofsentences: list):
    """
    see:
        http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

    see also:

        https://nlpforhackers.io/topic-modeling/

    CountVectorizer:
    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature.


    see:
        https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer#35615151

    max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:

        max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
        max_df = 25 means "ignore terms that appear in more than 25 documents".

    The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.

    min_df is used for removing terms that appear too infrequently. For example:

        min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
        min_df = 5 means "ignore terms that appear in less than 5 documents".

    The default min_df is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms.

    notes:
        maxfreq of 1 will give you a lot of excessively common words: 'this', 'that', etc.
        maxfreq of

    on the general issue of graphing see also:
        https://speakerdeck.com/bmabey/visualizing-topic-models
        https://de.dariah.eu/tatom/topic_model_visualization.html

    on the axes:
        https://stats.stackexchange.com/questions/222/what-are-principal-component-scores

    """

    activepoll = so.poll
    vv = so.vectorvalues

    settings = {
        'maxfeatures': vv.ldamaxfeatures,
        'components': vv.ldacomponents,  # topics
        'maxfreq': vv.ldamaxfreq,  # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': vv.ldaminfreq,  # word must be found >n times
        'iterations': vv.ldaiterations,
        'mustbelongerthan': vv.ldamustbelongerthan
    }

    activepoll.statusis('Running the LDA vectorizer')
    # Use tf (raw term count) features for LDA.
    ldavectorizer = CountVectorizer(max_df=settings['maxfreq'],
                                    min_df=settings['minfreq'],
                                    max_features=settings['maxfeatures'])

    ldavectorized = ldavectorizer.fit_transform(bagsofsentences)

    ldamodel = LatentDirichletAllocation(n_components=settings['components'],
                                         max_iter=settings['iterations'],
                                         learning_method='online',
                                         learning_offset=50.,
                                         random_state=0)

    ldamodel.fit(ldavectorized)

    visualisation = ldavis.prepare(ldamodel, ldavectorized, ldavectorizer)
    # pyLDAvis.save_html(visualisation, 'ldavis.html')

    ldavishtmlandjs = pyLDAvis.prepared_data_to_html(visualisation)
    storevectorindatabase(so, ldavishtmlandjs)

    return ldavishtmlandjs
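
The docstring above spells out how CountVectorizer's max_df and min_df thresholds prune the vocabulary. A minimal sketch on a toy corpus (illustrative thresholds only, not the settings used in this example) showing both effects:

from sklearn.feature_extraction.text import CountVectorizer

# four tiny documents; document frequencies: "the"=3/4, "cat"=2/4, "sat"=2/4, the rest 1/4
docs = ["the cat sat", "the dog sat", "the cat ran", "one bird flew"]

# max_df=0.5 -> drop terms in strictly more than 50% of documents ("the")
# min_df=2   -> drop terms in fewer than 2 documents ("dog", "ran", "one", "bird", "flew")
vectorizer = CountVectorizer(max_df=0.5, min_df=2)
vectorizer.fit(docs)
print(sorted(vectorizer.vocabulary_))  # ['cat', 'sat']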
Example #18
def defectsClustering(username, password, repo):

    g = Github(username, password)
    user = g.get_user()
    repository = g.get_repo(repo)
    repoName = repo

    Issues = repository.get_issues()
    commitIssues = []

    try:
        from collections.abc import Callable  # noqa
    except ImportError:
        from collections import Callable  # noqa

    print("I am here")
    # Import Dataset
    df_issues = pd.read_csv('D:/CDAP/g-Codex/dataset.csv')
    #print(df.target_names.unique())
    print(df_issues)
    # df.head()
    #df_issues=pd.DataFrame(commitIssues)

    #Get all to lowercase
    #df_issues = df_issues.apply(lambda x: x.lower())

    #Remove punctuations
    df_issues.Issue = df_issues.Issue.apply(
        lambda x: x.translate(string.punctuation))
    #print("step 1")
    # Convert to list
    data = df_issues.Issue.values.tolist()

    #Sent to word
    def sent_to_words(sentences):
        for sentence in sentences:
            yield (gensim.utils.simple_preprocess(str(sentence), deacc=True))

    # Remove Emails
    data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

    # Remove new line characters
    data = [re.sub('\s+', ' ', sent) for sent in data]

    # Remove distracting single quotes
    data = [re.sub("\'", "", sent) for sent in data]

    data_words = list(sent_to_words(data))
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(
        data_words, min_count=5,
        threshold=100)  # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # See trigram example
    # print(trigram_mod[bigram_mod[data_words[0]]])
    print("step 2")

    # Define functions for stopwords, bigrams, trigrams and lemmatization
    def remove_stopwords(texts):
        return [[
            word for word in simple_preprocess(str(doc))
            if word not in stop_words
        ] for doc in texts]

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    # def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    #     """https://spacy.io/api/annotation"""
    #     texts_out = []
    #     for sent in texts:
    #         doc = nlp(" ".join(sent))
    #         texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    #     return texts_out

    # Remove Stop Words
    data_words_nostops = remove_stopwords(df_issues.Issue)

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)

    # Create Dictionary
    id2word = corpora.Dictionary(data_words_bigrams)

    # Create Corpus
    texts = data_words_bigrams

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # View

    id2word[0]

    # Human readable format of corpus (term-frequency)
    [[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

    #Testing for four topics
    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=4,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_words_bigrams,
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    # Print the Keyword in the 10 topics
    pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]

    for idx, topic in lda_model.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))

    # Compute Perplexity
    print('\nPerplexity: ', lda_model.log_perplexity(
        corpus))  # a measure of how good the model is. lower the better.

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_words_bigrams,
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score for for five topics: ', coherence_lda)

    # Visualize the topics
    #pyLDAvis.enable_notebook()

    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    # pyLDAvis.show(vis, '192.168.8.100', port=8888, n_retries=5, local=False, open_browser=True, http_server= None)
    #pyLDAvis.save_html(vis,'kush.html')
    print('hi')
    ad = pyLDAvis.prepared_data_to_html(vis, template_type="general")
    print(ad)
    return ad
Example #19
def model_page():
    vis_html = pyLDAvis.prepared_data_to_html(vis)
    return render_template('model.html', vis_html=vis_html)
Example #20
def get_lda():
    tokenizer = RegexpTokenizer(r'\w+')

    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # create sample documents
    doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
    doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
    doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
    doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
    doc_e = "Health professionals say that brocolli is good for your health."

    doc_set2 = [i for i in range(1,10)]
    print(doc_set2)

    # compile sample documents into a list
    doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

    # list for tokenized documents in loop
    texts = []

    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        tagged_tokens = nltk.pos_tag(tokens)
        print(tagged_tokens)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]

        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

        # add tokens to list
        texts.append(stemmed_tokens)
        print (tokens)
        print (stemmed_tokens)
        print ("--------------------------------")
    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    lda = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)

    print (lda.show_topics())



    import matplotlib
    matplotlib.use('qt5agg')

    import pyLDAvis.gensim as gensimvis
    import pyLDAvis

    vis_data = gensimvis.prepare(lda, corpus, dictionary)
    x = pyLDAvis.prepared_data_to_html(vis_data)
    #print (x)



    return x


    print("-------------------")

    '''
Example #21
def hdp_model(corpus, dictionary):
    hdp = models.HdpModel(corpus, id2word=dictionary)
    hdp_data = gensimvis.prepare(hdp, corpus, dictionary)
    html_string = pyLDAvis.prepared_data_to_html(hdp_data)
    components.v1.html(html_string, width=1280, height=1024)
Example #22
# generate TF-IDF, LDA model
from gensim import models
tfidf_model = models.TfidfModel(corpus)
tfidf = tfidf_model[corpus]
print("\n","=========== TF-IDF ============")
# print first 10 elements of first document's tf-idf vector
print("\n",tfidf.corpus[0][:10])
# print top 10 elements of first document's tf-idf vector
print("\n",sorted(tfidf.corpus[0], key=lambda x: x[1], reverse=True)[:10])
# print token of most frequent element
#print("\n",dictionary.get(13))

n_topics = 5
lda = models.ldamodel.LdaModel(tfidf, num_topics=n_topics, id2word=dictionary, passes=1)
print("\n","=========== lda.show_topics() ============")
#print(lda.show_topics())
print(lda.print_topics(num_topics=n_topics, num_words=10))

import matplotlib
matplotlib.use('qt5agg')

import pyLDAvis.gensim as gensimvis
import pyLDAvis

vis_data = gensimvis.prepare(lda, corpus, dictionary)
x = pyLDAvis.prepared_data_to_html(vis_data)
print (x)


Example #23
def ldatopicgraphing(sentencetuples,
                     workssearched,
                     searchobject,
                     headwordstops=True):
    """

	a sentence tuple looks like:
		('gr2397w001_ln_42', 'ποίῳ δὴ τούτων ἄξιον τὸν κόϲμον φθείρεϲθαι φάναι')

	see:
		http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

	see also:

		https://nlpforhackers.io/topic-modeling/

	CountVectorizer:
	max_df : float in range [0.0, 1.0] or int, default=1.0
	    When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).

	min_df : float in range [0.0, 1.0] or int, default=1
	    When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature.


	see:
		https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer#35615151

	max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:

		max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
		max_df = 25 means "ignore terms that appear in more than 25 documents".

	The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.

	min_df is used for removing terms that appear too infrequently. For example:

		min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
		min_df = 5 means "ignore terms that appear in less than 5 documents".

	The default min_df is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms.

	notes:
		maxfreq of 1 will give you a lot of excessively common words: 'this', 'that', etc.
		maxfreq of

	on the general issue of graphing see also:
		https://speakerdeck.com/bmabey/visualizing-topic-models
		https://de.dariah.eu/tatom/topic_model_visualization.html

	on the axes:
		https://stats.stackexchange.com/questions/222/what-are-principal-component-scores

	:param sentencetuples:
	:param activepoll:
	:return:
	"""

    if headwordstops:
        stops = mostcommonwordsviaheadwords()
    else:
        stops = mostcommoninflectedforms()

    sentencetuples = [(a, removestopwords(b, stops))
                      for a, b in sentencetuples]

    activepoll = searchobject.poll
    vv = searchobject.vectorvalues

    settings = {
        'maxfeatures': vv.ldamaxfeatures,
        'components': vv.ldacomponents,  # topics
        'maxfreq': vv.ldamaxfreq,  # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': vv.ldaminfreq,  # word must be found >n times
        'iterations': vv.ldaiterations,
        'mustbelongerthan': vv.ldamustbelongerthan
    }

    # not easy to store/fetch since you need both ldavectorizer and ldamodel
    # so we just store the actual graph...
    ldavishtmlandjs = checkforstoredvector(searchobject, 'lda')

    if not ldavishtmlandjs:
        sentencetuples = [
            s for s in sentencetuples
            if len(s[1].strip().split(' ')) > settings['mustbelongerthan']
        ]
        sentences = [s[1] for s in sentencetuples]

        sentencesaslists = [s.split(' ') for s in sentences]
        allwordsinorder = [
            item for sublist in sentencesaslists for item in sublist if item
        ]

        activepoll.statusis('Finding all headwords')
        morphdict = getrequiredmorphobjects(set(allwordsinorder),
                                            furtherdeabbreviate=True)
        morphdict = convertmophdicttodict(morphdict)

        bagsofwordlists = buildwordbags(searchobject, morphdict,
                                        sentencesaslists)
        bagsofsentences = [' '.join(b) for b in bagsofwordlists]

        # print('bagsofsentences[:3]', bagsofsentences[3:])

        activepoll.statusis('Running the LDA vectorizer')
        # Use tf (raw term count) features for LDA.
        ldavectorizer = CountVectorizer(max_df=settings['maxfreq'],
                                        min_df=settings['minfreq'],
                                        max_features=settings['maxfeatures'])

        ldavectorized = ldavectorizer.fit_transform(bagsofsentences)

        ldamodel = LatentDirichletAllocation(
            n_components=settings['components'],
            max_iter=settings['iterations'],
            learning_method='online',
            learning_offset=50.,
            random_state=0)

        ldamodel.fit(ldavectorized)

        visualisation = ldavis.prepare(ldamodel, ldavectorized, ldavectorizer)
        # pyLDAvis.save_html(visualisation, 'ldavis.html')

        ldavishtmlandjs = pyLDAvis.prepared_data_to_html(visualisation)
        storevectorindatabase(searchobject, 'lda', ldavishtmlandjs)

    jsonoutput = ldatopicsgenerateoutput(ldavishtmlandjs, searchobject)

    return jsonoutput
Example #24
year=2016
topic_num=10

argc=len(sys.argv)
if(argc>1):
    year=int(sys.argv[1])
if(argc>2):
    topic_num=int(sys.argv[2])

if(argc>3):
    conference=sys.argv[3]

relpath= conference+str(year)
print(conference, year, topic_num)

fname=relpath+'/papers'
outfname=relpath+'papers'

dictionary = gensim.corpora.Dictionary.load(fname+'.dict')
corpus = gensim.corpora.MmCorpus(fname+'.mm')
lda = gensim.models.ldamodel.LdaModel.load(fname+'_%d.model'%topic_num)

pdata=pyLDAvis.gensim.prepare(lda, corpus, dictionary)
p=pyLDAvis.prepared_data_to_html(pdata)

with open(outfname+"_%d.html"%(topic_num),"w") as fp:
    print >>fp,"<h1> %s %d</h1>"%(conference.upper(),year)
    print >>fp,"topic num=%d"%topic_num
    print >>fp,p
Example #25
def display_with_header(data, header):
    hdata = pyLDAvis.prepared_data_to_html(data)
    hheader = '<h1>%s</h1>' % (header)
    display(HTML(hheader+hdata))
Example #26
def main(filename):
    global data_vectorized
    global lda_output
    global plot_df
    df = pd.read_csv(UPLOAD_FOLDER + '/' + filename)  # CHANGE THIS
    df = df.sample(frac=0.2, replace=False, random_state=1)
    N_NGRAM_RANGE = 2  # CHANGE HERE
    my_additional_stop_words = pd.read_csv(
        r'C:\Users\noel.alexander\Documents\Fullstack\Topic Modelling\Stopwords\custom_stopwords.csv'
    ).values.flatten().tolist()  #CHANGE THIS
    stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)
    data = df.content.values.tolist()

    # Remove Emails
    data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

    # Remove new line characters
    data = [re.sub(r'\s+', ' ', sent) for sent in data]

    # Remove distracting single quotes
    data = [re.sub("\'", "", sent) for sent in data]
    data_words = list(sent_to_words(data))
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    # Do lemmatization keeping only Noun, Adj, Verb, Adverb
    data_lemmatized = lemmatization(
        n=nlp,
        texts=data_words,
        allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    vectorizer = CountVectorizer(
        analyzer='word',
        min_df=0.05,  # ignore terms that appear in less than 5% of the documents
        stop_words=stop_words,  # remove stop words
        lowercase=True,  # convert all words to lowercase
        token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
        ngram_range=(1, N_NGRAM_RANGE))

    data_vectorized = vectorizer.fit_transform(data_lemmatized)
    space = {
        'n_topics': hp.quniform("n_topics", 6, 10, 1),  # search n_topics from 6 to 10
        'learning_decay': hp.uniform('learning_decay', 0.5, 0.9),  # search learning_decay from 0.5 to 0.9
    }

    trials = Trials()

    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=25,
                trials=trials)

    LEARNING_DECAY = best['learning_decay']  #0.84529 #best['learning_decay']
    N_TOPICS = best['n_topics']  #9 #best['n_topics']
    print('starting lda')
    # Build LDA Model
    lda_model = LatentDirichletAllocation(
        n_components=int(N_TOPICS),  # number of topics
        learning_decay=LEARNING_DECAY,  # control learning rate in the online learning method
        max_iter=10,  # max learning iterations
        learning_method='online',  # use mini-batch of training data
        batch_size=128,  # n docs in each learning iter
        n_jobs=-1,  # use all available CPUs
    )

    lda_output = lda_model.fit_transform(data_vectorized)
    lda_output = lda_model.transform(data_vectorized)

    # column names
    topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

    # index names
    docnames = ["Doc" + str(i) for i in range(len(data))]

    # Make the pandas dataframe
    df_document_topic = pd.DataFrame(np.round(lda_output, 2),
                                     columns=topicnames,
                                     index=docnames)

    # Get dominant topic for each document
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic
    # Apply Style
    df_document_topics = df_document_topic.head(15).style.applymap(
        color_green).applymap(make_bold)
    df_topic_distribution = df_document_topic['dominant_topic'].value_counts(
    ).reset_index(name="Num Documents")
    df_topic_distribution.columns = ['Topic Num', 'Num Documents']
    df_topic_distribution['Percent of Total'] = round(
        df_topic_distribution['Num Documents'] /
        np.sum(df_topic_distribution['Num Documents'].values), 2)
    topic_keywords = show_topics(vectorizer=vectorizer,
                                 lda_model=lda_model,
                                 n_words=15)

    # Topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = [
        'Word ' + str(i) for i in range(df_topic_keywords.shape[1])
    ]
    df_topic_keywords.index = [
        'Topic ' + str(i) for i in range(df_topic_keywords.shape[0])
    ]
    #pyLDAvis.enable_notebook()
    panel = pyLDAvis.sklearn.prepare(lda_model,
                                     data_vectorized,
                                     vectorizer,
                                     mds='tsne')
    topics_dic = {}
    for i in range(int(N_TOPICS)):
        topics_dic[i] = 'topic ' + str(i)

    # assign each document to its dominant topic before building the plot dataframe
    labels = []
    for doc in lda_output:
        labels.append(np.argmax(doc))
    labels = np.array(labels)

    plot_df = pd.DataFrame({'topics': labels})
    plot_df['topics'] = plot_df['topics'].map(topics_dic)

    embedding = umap.UMAP(n_neighbors=100,
                          min_dist=0.9).fit_transform(lda_output)

    plot_df['axis_1'] = embedding[:, 0]
    plot_df['axis_2'] = embedding[:, 1]

    html = pyLDAvis.prepared_data_to_html(panel)

    return html
Example #27
def model_function():
    #***********************************************************************
    #                Defining the different news categories

    categories = {
        'Sports': [
            'football', 'ball', 'team', 'play', 'win', 'season', 'fan', 'run',
            'scoore', 'athletics', 'spectator', 'competition', 'tennis',
            'yard', 'game', 'fun', 'cricket', 'stadium', 'uefa', 'concacaf',
            'player', 'game', 'referee'
        ],
        'Medical': [
            'patient', 'study', 'slave', 'food', 'eat', 'pain', 'treatment',
            'syndrome', 'therapy', 'medicine', 'health', 'doctor', 'diagnosis',
            'clinical', 'biomedical'
        ],
        'World News': [
            'israel', 'war', 'kill', 'soldier', 'attack', 'war', 'government',
            'racism', 'internet', 'newpaper', 'journalism', 'telephone',
            'earth', 'country', 'conflict', 'civil', 'military', 'peace',
            'war', 'hurt', 'army'
        ],
        'Religion': [
            'god', 'evidence', 'christian', 'believe', 'reason', 'faith',
            'exist', 'bible', 'religion', 'judaism', 'cult', 'belief',
            'theology', 'church', 'symbol', 'homosexuality', 'hell'
        ],
        'Lifestyle': [
            'trending', 'fashion', 'entertainment', 'society', 'person',
            'mode', 'lifestyles', 'casual', 'healthy', 'chic', 'cosmopolitan',
            'popular', 'social', 'fashionable', 'celebrity', 'carpet', 'red',
            'body', 'dress', 'business', 'workplace', 'fun', 'holiday', 'buy',
            'living', 'hobbies', 'hipster'
        ],
        'Culture': [
            'education', 'knowledge', 'learn', 'learning', 'literacy',
            'urbanity', 'class', 'civility', 'ignorance', 'civilization',
            'life', 'values', 'legacy', 'tradition', 'society', 'philosophy',
            'religion', 'nationalism', 'art', 'music', 'ritual', 'concept',
            'humanism', 'classical'
        ],
        'Politics': [
            'government', 'diplomatic', 'law', 'political', 'politics',
            'governance', 'republic', 'state', 'police', 'monarchy',
            'democratic', 'federation', 'city', 'company', 'country', 'latin',
            'uk', 'usa'
        ],
        'Technology': [
            'videogame', 'xbox', 'play', 'station', 'video', 'smartphone',
            'nintendo', 'shooter', 'mobile', 'sony', 'gaming', 'electronics',
            'engineering', 'science', 'robot', 'robotics', 'internet',
            'computer', 'industry', 'automation', 'technological', 'energy',
            'device', 'devices', 'application', 'app', 'technology'
        ],
        'Entertainment': [
            'television', 'film', 'movie', 'animation', 'comedy', 'cinema',
            'media', 'show', 'circus', 'dance', 'concert', 'online', 'radio',
            'party', 'ceremony', 'tourist'
        ],
        'Food': [
            'nutrition', 'rice', 'nutrient', 'beef', 'meat', 'cook', 'cooking',
            'seafood', 'cereal', 'fat', 'soup', 'pasta', 'butter',
            'agriculture', 'meal', 'milk', 'animals', 'chicken', 'plant',
            'energy', 'vegetarian', 'protein', 'vitamin', 'nutriment',
            'aliment', 'fruit', 'vegetable', 'restaurant', 'restaurants',
            'eat', 'kitchen', 'pizza', 'taste'
        ],
    }

    #***********************************************************************
    #                  For eliminating unnecessary words

    add_stop_words = [
        'said', 'would', 'one', 'even', 'really', 'could', 'also'
    ]

    stop_words = stopwords.words('english')

    stop_words.extend(add_stop_words)

    stop_words_set = set(stop_words)

    #***********************************************************************
    #                    Preparing the texts

    #***********************************************************************
    #                        Pulling Mongo Data
    myclient = MongoClient("mongodb://{}:5003/".format(mongo_server))
    mydb = myclient["mydatabase"]
    mycol = mydb["prueba"]

    res = mycol.find({}, {"Text": 1, "Title": 1, "Link": 1, "Time": 1})
    res_data_frame = pd.DataFrame(list(res))

    textos = res_data_frame["Text"]
    names = res_data_frame["Title"]
    urls = res_data_frame["Link"]
    time = res_data_frame["Time"]

    texts = []
    documents = []
    for t in textos:
        string = ''.join(t.splitlines())
        string = string.lower()
        word_tokens = word_tokenize(string)
        filtered_sentence = [w for w in word_tokens if not w in stop_words_set]
        documents.append(" ".join(filtered_sentence))
        texts.append(filtered_sentence)

    #***********************************************************************
    #                Model training and graph representation

    tokenized_list = [simple_preprocess(doc) for doc in documents]
    mydict = corpora.Dictionary()
    mycorpus = [
        mydict.doc2bow(doc, allow_update=True) for doc in tokenized_list
    ]
    word_counts = [[(mydict[id], count) for id, count in line]
                   for line in mycorpus]
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=mycorpus,
        id2word=corpora.Dictionary(tokenized_list),
        num_topics=len(categories),
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=30,
        alpha='auto',
        per_word_topics=True)

    vis = pyLDAvis.gensim.prepare(lda_model,
                                  mycorpus,
                                  corpora.Dictionary(tokenized_list),
                                  n_jobs=2)
    graphhtml = pyLDAvis.prepared_data_to_html(vis)

    table_data = []
    for topic in lda_model.print_topics():
        topic_data = list(topic)
        cate = get_cat(topic, categories)
        table_data.append(topic_data)
        topic_data.append(cate)

    #********************************************************************
    #              DataFrame that tells the categories,
    #           their ID's and the words that create them

    df_categories = pd.DataFrame(table_data,
                                 columns=['ID', 'Words', 'Categorie'])
    #********************************************************************

    get_document_topics = lda_model.get_document_topics(mycorpus)

    news_classification = []
    for n in range(len(get_document_topics)):
        for i in range(len(table_data)):
            if get_document_topics[n][0][0] == table_data[i][0]:
                news_classification.append([
                    get_document_topics[n][0][1], table_data[i][2], names[n],
                    urls[n], time[n]
                ])

    #********************************************************************
    #  DataFrame that tells the categories for each article and its url

    df_classification = pd.DataFrame(
        news_classification,
        columns=['Belonging', 'Classification', 'Title', 'Link', 'Time'])
    df_classification = df_classification.sort_values(by="Time",
                                                      ascending=True)
    df_classification["Time"] = df_classification["Time"].apply(
        lambda x: x.ctime())

    return [df_categories, df_classification, graphhtml]
Example #28
def _lda4(table,
          input_col,
          topic_name='topic',
          num_voca=1000,
          num_topic=5,
          num_topic_word=10,
          max_iter=20,
          learning_method='online',
          learning_offset=10.,
          random_state=None):
    # generate model
    corpus = np.array(table[input_col])
    if isinstance(corpus[0], np.ndarray):
        tf_vectorizer = CountVectorizer(preprocessor=' '.join,
                                        stop_words='english',
                                        max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca)
    else:
        tf_vectorizer = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca,
                                        stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()
    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")
    log_likelihood = lda_model.score(term_count)
    perplexity = lda_model.perplexity(term_count)

    # create topic table
    vocab_weights_list = []
    vocab_list = []
    weights_list = []
    topic_term_prob = normalize(lda_model.components_, norm='l1')
    for vector in topic_term_prob:
        pairs = []
        for term_idx, value in enumerate(vector):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        vocab_weights = []
        vocab = []
        weights = []
        for pair in pairs[:num_topic_word]:
            vocab_weights.append("{}: {}".format(pair[1], pair[0]))
            vocab.append(pair[1])
            weights.append(pair[0])
        vocab_weights_list.append(vocab_weights)
        vocab_list.append(vocab)
        weights_list.append(weights)
    topic_table = pd.DataFrame({
        'vocabularies_weights': vocab_weights_list,
        'vocabularies': vocab_list,
        'weights': weights_list
    })
    topic_table['index'] = [idx + 1 for idx in topic_table.index]
    topic_table = topic_table[[
        'index', 'vocabularies_weights', 'vocabularies', 'weights'
    ]]

    # create output table
    doc_topic = lda_model.transform(term_count)
    out_table = pd.DataFrame.copy(table, deep=True)
    topic_dist_name = topic_name + '_distribution'
    if topic_name in table.columns or topic_dist_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Existing table contains Topic Column Name. Please choose again."
        }])
    out_table[topic_name] = [
        doc_topic[i].argmax() + 1 for i in range(len(corpus))
    ]
    out_table[topic_dist_name] = doc_topic.tolist()

    # pyLDAvis
    prepared_data = ldavis.prepare(lda_model, term_count, tf_vectorizer)
    html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {
        'Input column': input_col,
        'Topic column name': topic_name,
        'Number of topics': num_topic,
        'Number of words for each topic': num_topic_word,
        'Maximum number of iterations': max_iter,
        'Learning method': learning_method,
        'Learning offset': learning_offset,
        'Seed': random_state
    }
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Latent Dirichlet Allocation Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(
        strip_margin("""
    |
    | ### Log Likelihood
    | {log_likelihood}
    |
    | ### Perplexity
    | {perplexity}
    |
    | ### Parameters
    | {params}
    """.format(log_likelihood=log_likelihood,
               perplexity=perplexity,
               params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['lda_model'] = lda_model
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
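A hedged sketch of calling _lda4 on a small synthetic DataFrame. It assumes the module-level imports the function relies on (numpy/pandas, CountVectorizer, LatentDirichletAllocation, normalize, pyLDAvis with pyLDAvis.sklearn imported as ldavis, and the Brightics report helpers) are available, and an sklearn version old enough to still provide get_feature_names; the toy vocabulary, parameters, and column name are illustrative only.

import random
import pandas as pd

# Two artificial word pools, large enough that plenty of terms survive
# CountVectorizer's min_df / max_df filtering inside _lda4
random.seed(0)
finance = ['stock', 'market', 'price', 'investor', 'trade', 'profit', 'share',
           'earning', 'fund', 'bank', 'rate', 'growth', 'economy', 'revenue',
           'asset', 'bond', 'dividend', 'index']
sports = ['match', 'goal', 'team', 'player', 'coach', 'season', 'league',
          'score', 'striker', 'defender', 'stadium', 'final', 'tournament',
          'win', 'draw', 'penalty', 'keeper', 'transfer']
docs = pd.DataFrame({
    'text': [' '.join(random.choices(finance if i % 2 == 0 else sports, k=25))
             for i in range(40)]
})

result = _lda4(docs, input_col='text', num_topic=2, max_iter=10, random_state=0)
out_table = result['out_table']      # original rows plus 'topic' and 'topic_distribution'
topic_table = result['topic_table']  # top words and weights per topic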
Example #29
0
nmf = NMF(n_components=nbr_topics,
          random_state=1,
          alpha=.1,
          l1_ratio=.5,
          init='nndsvd').fit(tfidf)
nmf_topics = nmf.transform(tfidf)
lda = LatentDirichletAllocation(n_components=nbr_topics,
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0).fit(tf)
lda_topics = lda.transform(tf)

# Plot pretty LDA output
lda_vis_data = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
lda_vis_data_html = pyLDAvis.prepared_data_to_html(lda_vis_data)
pyLDAvis.show(lda_vis_data)

# Assume top 10 words will be used per topic
top_n = 10
total_rows = nbr_topics * top_n
topic = []
topic_top_ten = []
topic_top_ten_scores = []
for tid, t in enumerate(lda.components_):
    topic.append([tid + 1] * top_n)
    topic_top_ten.append(
        [tf_feature_names[i] for i in t.argsort()[:-top_n - 1:-1]])
    topic_top_ten_scores.append(t[t.argsort()[:-top_n - 1:-1]])
top_words = np.concatenate([
    np.array(topic).reshape(total_rows, 1),
    np.array(topic_top_ten).reshape(total_rows, 1),
    np.array(topic_top_ten_scores).reshape(total_rows, 1)
], axis=1)  # assumed completion: one row per top word with topic id, word, score
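A short hedged follow-up: wrapping the assembled top_words array in a DataFrame and writing the pyLDAvis HTML to disk. The column names and the output path are assumptions, not taken from the original snippet.

import pandas as pd

top_words_df = pd.DataFrame(top_words, columns=['topic', 'word', 'score'])
top_words_df['topic'] = top_words_df['topic'].astype(int)
top_words_df['score'] = top_words_df['score'].astype(float)

# Save the LDA visualisation generated above
with open('lda_topics.html', 'w') as f:
    f.write(lda_vis_data_html)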
Example #30
0
    def visualize_lda_model(self, lda_model, corpus, id2word):
        """Return a pyLDAvis visualisation of a gensim LDA model as an HTML string."""
        data = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
        html = pyLDAvis.prepared_data_to_html(data)

        return html
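A self-contained sketch exercising the same two pyLDAvis calls on a tiny gensim model. The toy documents, topic count, R value, and output filename are assumptions, and it presumes a pyLDAvis release that still ships the pyLDAvis.gensim module, as the method above does.

import gensim
import pyLDAvis
import pyLDAvis.gensim
from gensim import corpora

docs = [['stock', 'market', 'price', 'investor'],
        ['market', 'trade', 'stock', 'profit'],
        ['match', 'goal', 'team', 'player'],
        ['team', 'season', 'goal', 'coach'],
        ['price', 'profit', 'investor', 'trade'],
        ['player', 'season', 'match', 'coach']]
id2word = corpora.Dictionary(docs)
corpus = [id2word.doc2bow(doc) for doc in docs]
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=2,
                                            random_state=42,
                                            passes=10)

# Same pipeline as visualize_lda_model above; R is trimmed to the toy vocabulary
data = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, R=10)
html = pyLDAvis.prepared_data_to_html(data)
with open('lda_vis.html', 'w') as f:
    f.write(html)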
Example #31
0
def model_params(clicks, topics, iterations, tags, gender, rank, rel, years):

    # Lists all triggered callbacks
    changed_id = [p['prop_id'] for p in dash.callback_context.triggered][0]

    # Only runs the model training if the button has been clicked
    if 'button' in changed_id:

        # Non-filtered data
        data = df

        # Filters the data based on user's choices
        if len(tags) != len(pos_tags):
            data = tm.filter_by_tag(df, tags)
        if gender != 'A':
            data = tm.filter_by_sex(data, gender)
        if len(rank) != len(rank_set):
            data = tm.filter_by_rank(data, rank)
        if len(rel) != len(rel_set):
            data = tm.filter_by_rel(data, rel)
        if years[0] != min(years_set) or years[1] != max(years_set):
            data = tm.filter_by_time(data, years)

        # Data preprocessing for the LDA model
        corpus, dictionary, docs, strings = tm.prepare_data(data)
        # Creates the LDA topic model
        model, top_topics = tm.train_lda(corpus, dictionary, topics,
                                         iterations)

        dominant_topics = tm.letter_topics(model, corpus, strings)

        letters_for_topics = tm.get_most_representative(dominant_topics)

        letters_per_topic = tm.letters_per_topic(dominant_topics)

        # Loop that creates a dataframe from the LDA top topics list
        i = 1
        topic_dict = {}
        for topic in top_topics:
            entries = []
            for t in topic[0]:
                score = round(float(t[0]), 3)
                tmp = t[1] + ', ' + str(score)
                entries.append(tmp)
            topic_dict['Topic {}'.format(i)] = entries
            i += 1

        dataframe = pd.DataFrame(topic_dict)
        cols = [{"name": i, "id": i} for i in dataframe.columns]
        cols2 = [{"name": i, "id": i} for i in letters_for_topics.columns[1:]]
        cols3 = [{"name": i, "id": i} for i in letters_per_topic.columns[1:]]

        # Creates the pyLDAvis visualisation of the LDA model
        vis_data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
        html_vis = pyLDAvis.prepared_data_to_html(vis_data,
                                                  template_type='general')

        return dataframe.to_dict('records'), cols, letters_for_topics.to_dict(
            'records'), cols2, letters_per_topic.to_dict(
                'records'), cols3, html_vis

    else:
        return no_update, no_update, no_update, no_update, no_update, no_update, no_update
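The html_vis string returned by the callback is typically rendered through an Iframe's srcDoc property. A minimal sketch of such a layout component; the component id, sizing, and Dash >= 2 import style are assumptions, not taken from the original app.

from dash import html

# Hypothetical layout component: the callback's html_vis output would be wired
# to this Iframe's srcDoc property
lda_vis_iframe = html.Iframe(id='lda-vis',
                             srcDoc=None,
                             style={'width': '100%',
                                    'height': '900px',
                                    'border': 'none'})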