Example #1
# remove puncts
docx.remove_puncts().text

# remove puncts (most_common=False)
docx.remove_puncts(most_common = False).text

# remove puncts (most_common=True)
docx.remove_puncts(most_common = True).text

nfx.remove_userhandles(s)

nfx.remove_hashtags(s)

nfx.remove_special_characters(s)

nfx.remove_stopwords(s)

# clean text
s2 = '@mandagoforth me bad! Its funny though. Zachary Quinto is only there for a few though.  & to reply just put the @ symbol before the name!'

s2

nfx.clean_text(s2)

nfx.clean_text(s2, puncts=False, stopwords=True, emails=True, urls=True)

# working on the dataset
data

# noise scan
data['twitts'].apply(lambda x: nt.TextFrame(x).noise_scan()['text_noise'])
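
The calls in this example rely on the neattext package (imported as nt, with neattext.functions as nfx) and on variables such as s and docx that are defined elsewhere. A minimal, self-contained sketch of the same calls, with a placeholder sample string, might look like this:

import neattext as nt
import neattext.functions as nfx

s = "@user check out https://example.com #nlp !!!"
docx = nt.TextFrame(s)

print(docx.remove_puncts().text)       # punctuation stripped
print(nfx.remove_userhandles(s))       # drop @user handles
print(nfx.remove_hashtags(s))          # drop #hashtags
print(nfx.remove_stopwords(s))         # drop common stopwords
print(nfx.clean_text(s, puncts=False, stopwords=True, urls=True))
print(nt.TextFrame(s).noise_scan())    # dict that includes a 'text_noise' percentage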
Example #2
def main():
    # st.title("StreamBible")
    stc.html(HTML_BANNER)
    menu = ["Home", "MultiVerse", "About"]

    df = load_bible("data/KJV_Bible.csv")

    choice = st.sidebar.selectbox("Menu", menu)

    if choice == "Home":
        st.subheader("Single Verse Search")
        # st.dataframe(df)

        book_list = df['book'].unique().tolist()
        book_name = st.sidebar.selectbox("Books", book_list)
        chapter = st.sidebar.number_input("Chapter", 1)
        verse = st.sidebar.number_input("Verse", 1)
        bible_df = df[df['book'] == book_name]
        # st.dataframe(bible_df)

        # Layout
        c1, c2 = st.beta_columns([2, 1])
        # Single Verse Layout
        with c1:
            try:
                selected_passage = bible_df[(bible_df["chapter"] == chapter)
                                            & (bible_df["verse"] == verse)]
                passage_details = "{0} Chapter::{1} Verse::{2}".format(
                    book_name, chapter, verse)
                st.info(passage_details)
                passage = "{}".format(selected_passage["text"].values[0])
                st.write(passage)
            except IndexError:
                st.warning("Book out of Range")
            except Exception as e:
                st.warning(e.args)

        with c2:
            st.success("Verse of the Day")
            chapter_list = range(10)
            verse_list = range(20)
            ch_choice = random.choice(chapter_list)
            vs_choice = random.choice(verse_list)
            random_book_name = random.choice(book_list)

            st.write("Book:{},Ch:{},Vs:{}".format(random_book_name, ch_choice,
                                                  vs_choice))
            rand_bible_df = df[df["book"] == random_book_name]

            try:
                randomly_selected_passage = rand_bible_df[
                    (rand_bible_df["chapter"] == ch_choice)
                    & (rand_bible_df["verse"] == vs_choice)]
                mytext = randomly_selected_passage["text"].values[0]
            except:
                mytext = rand_bible_df[(rand_bible_df["chapter"] == 1) & (
                    rand_bible_df["verse"] == 1)]["text"].values[0]

            # st.write(mytext)

            stc.html(HTML_RANDOM_TEMPLATE.format(mytext), height=300)

        # Search Topic/Term
        search_term = st.text_input("Term/Topic")
        with st.beta_expander("View Results"):
            retrieved_df = df[df["text"].str.contains(search_term)]
            st.dataframe(retrieved_df[["book", "chapter", "verse", "text"]])

    if choice == "MultiVerse":
        st.subheader("MultiVerse Retrieval")
        book_list = df["book"].unique().tolist()
        book_name = st.sidebar.selectbox("Book", book_list)
        chapter = st.sidebar.number_input("Chapter", 1)
        bible_df = df[df["book"] == book_name]
        all_verse = bible_df["verse"].unique().tolist()
        verse = st.sidebar.multiselect("Verse", all_verse, default=1)
        selected_passage = bible_df[bible_df["verse"].isin(verse)]
        st.dataframe(selected_passage)
        passage_details = "{} Chapter::{} Verse::{}".format(
            book_name, chapter, verse)
        st.info(passage_details)

        # Layout
        col1, col2 = st.beta_columns(2)
        # Join all text as a sentence
        docx = " ".join(selected_passage["text"].unique().tolist())

        with col1:
            st.info("Details")
            for _, row in selected_passage.iterrows():
                st.write(row["text"])

        with col2:
            st.success("StudyMode")
            with st.beta_expander("Visualize Entities"):
                # st.write(docx)
                render_entities(docx)

            with st.beta_expander("Visualize Pos Tags"):
                tagged_docx = get_tags(docx)
                processed_tags = mytag_visualizer(tagged_docx)
                # st.write(processed_tags)  # Raw
                stc.html(processed_tags, height=1000, scrolling=True)

            with st.beta_expander("Keywords"):
                processed_docx = nfx.remove_stopwords(docx)
                keywords_tokens = get_most_common_tokens(processed_docx, 5)
                st.write(keywords_tokens)

        with st.beta_expander("Verse Curve"):
            plot_mendelhall_curve(docx)

        with st.beta_expander("Word Freq Plot"):
            plot_word_freq_with_altair(docx)

        with st.beta_expander("Pos Tags Plot"):
            tagged_docx = get_tags(docx)
            tagged_df = pd.DataFrame(tagged_docx, columns=["Tokens", "Tags"])
            # st.dataframe(tagged_df)
            df_tag_count = tagged_df["Tags"].value_counts().to_frame("counts")
            df_tag_count["tag_type"] = df_tag_count.index
            # st.dataframe(df_tag_count)

            c = alt.Chart(df_tag_count).mark_bar().encode(x="tag_type",
                                                          y="counts")
            st.altair_chart(c, use_container_width=True)

    else:
        st.subheader("About")
        st.text("Build with Streamlit")
        st.text("Example from Jesse E.Agbe(JCharis)")
        st.success(cat)
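
Several of these apps call a get_most_common_tokens helper that is not shown in the listings. Judging from how its result is used (a dict-like object whose .keys() and .values() feed plt.bar and st.write), a plausible minimal version, sketched here as an assumption, is:

from collections import Counter

def get_most_common_tokens(docx, num=10):
    # count whitespace-separated tokens and keep the `num` most frequent as a dict
    word_tokens = Counter(docx.split())
    return dict(word_tokens.most_common(num))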
Example #3
def main():
	"""A Simple Summarization NLP App"""
	st.title("NLP App with Streamlit")
	menu = ['Home', 'NLP(files)','About']
	choice = st.sidebar.selectbox("Menu",menu)
	if choice == 'Home':
		st.subheader("Home: Analyse Text")
		raw_text = st.text_area("Enter Text Here")
		num_of_most_common = st.sidebar.number_input("Most Common Tokens",5,15)
		if st.button("Analyze"):
			
			with st.beta_expander("Original Text"):
				st.write(raw_text)
			
			with st.beta_expander("Text Analysis"):
				token_result_df = text_analyzer(raw_text)
				st.dataframe(token_result_df)
			
			with st.beta_expander("Entities"):
				# entity_result = get_entities(raw_text)
				# st.write(entity_result)

				entity_result = render_entities(raw_text)
				# st.write(entity_result)
				stc.html(entity_result, height=1000, scrolling=True)

			# Layouts
			col1, col2 = st.beta_columns(2)

			with col1:
				with st.beta_expander("Word Stats"):
					st.info("Word Statistics")
					docx = nt.TextFrame(raw_text)
					st.write(docx.word_stats())

				with st.beta_expander("Top Keywords"):
					st.info("Top Keywords/Tokens")
					processed_text = nfx.remove_stopwords(raw_text)
					keywords = get_most_common_tokens(processed_text, num_of_most_common)
					st.write(keywords)

				with st.beta_expander("Sentiment"):
					sent_result = get_sentiment(raw_text)
					st.write(sent_result)

			with col2:
				with st.beta_expander("Plot Word Freq"):
					fig = plt.figure()
					# sns.countplot(token_result_df['Token'])
					
					top_keywords = get_most_common_tokens(processed_text, num_of_most_common)
					plt.bar(keywords.keys(),top_keywords.values())
					plt.xticks(rotation=45)
					st.pyplot(fig)


				with st.beta_expander("Plot Part of Speech"):
					fig = plt.figure()
					sns.countplot(token_result_df['PoS'])
					plt.xticks(rotation=45)
					st.pyplot(fig)

				with st.beta_expander("Plot Wordcloud"):
					plot_wordcloud(raw_text)
				
			with st.beta_expander("Download Text Analysis Result"):
				make_downloadable(token_result_df)





	elif choice == 'NLP(files)':
		st.subheader("NLP Task")

		text_file = st.file_uploader("Upload Files",type=['pdf','docx','txt'])
		num_of_most_common = st.sidebar.number_input("Most Common Tokens",5,15)
		
		if text_file is not None:
			if text_file.type == 'application/pdf':
				raw_text = read_pdf(text_file)
				# st.write(raw_text)
			elif text_file.type == 'text/plain':
				# st.write(text_file.read()) # read as bytes
				raw_text = str(text_file.read(),"utf-8")
				# st.write(raw_text)
			else:
				raw_text = docx2txt.process(text_file)
				# st.write(raw_text)

			with st.beta_expander("Original Text"):
				st.write(raw_text)
			
			with st.beta_expander("Text Analysis"):
				token_result_df = text_analyzer(raw_text)
				st.dataframe(token_result_df)
			
			with st.beta_expander("Entities"):
				# entity_result = get_entities(raw_text)
				# st.write(entity_result)

				entity_result = render_entities(raw_text)
				# st.write(entity_result)
				stc.html(entity_result, height=1000, scrolling=True)

			# Layouts
			col1, col2 = st.beta_columns(2)

			with col1:
				with st.beta_expander("Word Stats"):
					st.info("Word Statistics")
					docx = nt.TextFrame(raw_text)
					st.write(docx.word_stats())

				with st.beta_expander("Top Keywords"):
					st.info("Top Keywords/Tokens")
					processed_text = nfx.remove_stopwords(raw_text)
					keywords = get_most_common_tokens(processed_text, num_of_most_common)
					st.write(keywords)

				with st.beta_expander("Sentiment"):
					sent_result = get_sentiment(raw_text)
					st.write(sent_result)

			with col2:
				with st.beta_expander("Plot Word Freq"):
					fig = plt.figure()
					# sns.countplot(token_result_df['Token'])
					
					top_keywords = get_most_common_tokens(processed_text, num_of_most_common)
					plt.bar(keywords.keys(),top_keywords.values())
					plt.xticks(rotation=45)
					st.pyplot(fig)


				with st.beta_expander("Plot Part of Speech"):
					try:	
						fig = plt.figure()
						sns.countplot(token_result_df['PoS'])
						plt.xticks(rotation=45)
						st.pyplot(fig)
					except:
						st.warning("Insufficient Data")

				with st.beta_expander("Plot Wordcloud"):
					plot_wordcloud(raw_text)
				
			with st.beta_expander("Download Text Analysis Result"):
				make_downloadable(token_result_df)




	else:
		st.subheader("About")
Example #4
def main():
    st.set_page_config(page_title="20 in 1 NLP tasks",layout='wide')
    #st.title('NER recognition app')
    options=['Home','Analysis','Custom text cleaning','Question and answering','Text summarization','Email extractor','Spelling correction',
             'Text generation','About']
    choice=st.sidebar.selectbox('Choose accordingly',options)


    if choice=='Home':
        image=Image.open('1_a3xerDP7jqQglKxfIxfxVw.jpeg')
        st.image(image)
        st.header('Multi **NLP** tasks in a single window')
        st.write("""
        # This web App contains different text analysis with visual representations and advanced tasks like QnA and Text Generation
        """)




    elif choice=='Analysis':
        st.subheader('Upload document')

        doc_file = st.file_uploader('', type=['csv', 'pdf', 'text', 'docx'])

        if doc_file is not None:
            file_details = doc_file.type
            if file_details == 'text/plain':
                raw_text = str(doc_file.read(), 'utf-8')
            elif file_details == 'application/pdf':
                raw_text = read_pdf(doc_file)

            else:
                raw_text = docx2txt.process(doc_file)

        elif doc_file is None:
            st.subheader('Or enter your input')
            raw_text = st.text_area(' ')

        
        if st.sidebar.checkbox('Analyze'):
            num_of_most_common=st.sidebar.number_input('Most common tokens',5,15)
            with st.beta_expander('Original text'):
                st.write(raw_text)

            with st.beta_expander('Basic Text Analysis'):
                data=text_analyzer(raw_text)
                st.dataframe(data)


            col1,col2=st.beta_columns(2)

            with col1:
                with st.beta_expander('Word Stats'):
                    st.info('Words statistics')
                    doc=nt.TextFrame(raw_text)
                    st.write(doc.word_stats())
                with st.beta_expander("Top Keywords"):
                    st.info("Top Keywords/Tokens")
                    processed_text = nfx.remove_stopwords(raw_text)
                    keywords = get_most_common_tokens(
                        processed_text, num_of_most_common
                    )
                    st.write(keywords)

                with st.beta_expander("Sentiment"):
                    sent_result = sentiment(raw_text)
                    st.write(sent_result)

            with col2:
                with st.beta_expander("Plot Word Freq"):
                    fig = plt.figure()
                    top_keywords = get_most_common_tokens(
                        processed_text, num_of_most_common
                    )
                    plt.bar(top_keywords.keys(), top_keywords.values())
                    plt.xticks(rotation=45)
                    st.pyplot(fig)

                with st.beta_expander('Plot of part of speech'):
                    fig=plt.figure()
                    sns.countplot(data['PoS'])
                    plt.xticks(rotation=45)
                    st.pyplot(fig)
                with st.beta_expander('Word Cloud Visualization'):
                    plot_wordcloud(raw_text)

        if st.sidebar.checkbox('Name Entity Recognition'):
            doc = nlp(raw_text)
            spacy_streamlit.visualize_ner(doc, labels=nlp.get_pipe('ner').labels,
                                          attrs=['text', 'label_', 'start', 'end'])




    elif choice=='Custom text cleaning':
        st.subheader('Custom text cleaning')
        doc_file = st.file_uploader('', type=['csv', 'pdf', 'text', 'docx'])

        if doc_file is not None:
            file_details = doc_file.type
            if file_details == 'text/plain':
                raw_text = str(doc_file.read(), 'utf-8')
            elif file_details == 'application/pdf':
                raw_text = read_pdf(doc_file)

            else:
                raw_text = docx2txt.process(doc_file)

        elif doc_file is None:
            st.subheader('Or enter your input')
            raw_text = st.text_area(' ')

        normalization = st.sidebar.checkbox('Text normalization')
        clean_stopwards = st.sidebar.checkbox('Remove stopwords')
        clean_punctuation = st.sidebar.checkbox('Remove punctuation')
        clean_numreric = st.sidebar.checkbox('Remove numbers')
        clean_special = st.sidebar.checkbox('Remove special characters')
        clean_url = st.sidebar.checkbox('Clean URLs')

        if st.button('Start process'):



            col1,col2=st.beta_columns(2)
            with col1:
                with st.beta_expander('Original text'):
                    st.write('The length is :',len(raw_text))
                    st.write(raw_text)

            with col2:
                with st.beta_expander('Processed text'):
                    if normalization:
                        raw_text=raw_text.lower()
                    if clean_stopwards:
                        raw_text=nfx.remove_stopwords(raw_text)
                    if clean_url:
                        raw_text=nfx.remove_urls(raw_text)
                    if clean_special:
                        raw_text=nfx.remove_special_characters(raw_text)
                    if clean_punctuation:
                        raw_text=nfx.remove_punctuations(raw_text)
                    if clean_numreric:
                        raw_text=nfx.remove_numbers(raw_text)
                    st.write('The length is :',len(raw_text))
                    st.write(raw_text)




    elif choice=='Text summarization':
        st.subheader('Extractive text summarization')
        doc_file = st.file_uploader('Upload', type=['csv', 'pdf', 'text', 'docx'])
        #

        if doc_file is not None:
            file_details = doc_file.type
            if file_details == 'text/plain':
                raw_text = str(doc_file.read(), 'utf-8')
            elif file_details == 'application/pdf':
                raw_text = read_pdf(doc_file)
            else:
                raw_text = docx2txt.process(doc_file)
        elif doc_file is None:
            raw_text = st.text_area('Or enter your input manually')

        if st.button("Summarize"):
            with st.beta_expander("Original Text"):
                st.write(raw_text)
            c1, c2 = st.beta_columns(2)

            with c1:
                with st.beta_expander("LexRank Summary"):
                    my_summary = sumy_summarizer(raw_text)
                    document_len = {"Original": len(raw_text),
                                    "Summary": len(my_summary)}
                    st.write(document_len)
                    st.write(my_summary)

                    st.info("Rouge Score")
                    eval_df = evaluate_summary(my_summary, raw_text)
                    #st.dataframe(eval_df.T)
                    eval_df['metrics'] = eval_df.index
                    c = alt.Chart(eval_df).mark_bar().encode(
                        x='metrics', y='rouge-1')
                    st.altair_chart(c)

            with c2:
                with st.beta_expander("Frequency based summary"):
                    summary=freq_summarization(raw_text)
                    document_len = {"Original": len(raw_text),
                                    "Summary": len(summary)}
                    st.write(document_len)
                    st.write(summary)
                    st.info("Rouge Score")
                    eval_df = evaluate_summary(summary, raw_text)
                    #st.dataframe(eval_df.T)
                    eval_df['metrics'] = eval_df.index
                    c = alt.Chart(eval_df).mark_bar().encode(
                        x='metrics', y='rouge-1')
                    st.altair_chart(c)




    # elif choice=='Document similarity':
    #     st.subheader('Document similarity check')

    #     doc_file_1 = st.file_uploader('Upload first document', type=['csv', 'pdf', 'text', 'docx'])
    #     if doc_file_1 is not None:
    #         file_details = doc_file_1.type
    #         if file_details == 'text/plain':
    #             raw_text_1 = str(doc_file_1.read(), 'utf-8')
    #         elif file_details == 'application/pdf':
    #             raw_text_1 = read_pdf(doc_file_1)
    #         else:
    #             raw_text_1 = docx2txt.process(doc_file_1)
    #     elif doc_file_1 is None:
    #         raw_text_1 = st.text_area('Upload first document manually')

    #     doc_file_2 = st.file_uploader('Upload second document', type=['csv', 'pdf', 'text', 'docx'])
    #     if doc_file_1 is not None:
    #         file_details = doc_file_2.type
    #         if file_details == 'text/plain':
    #             raw_text_2 = str(doc_file_2.read(), 'utf-8')
    #         elif file_details == 'application/pdf':
    #             raw_text_2 = read_pdf(doc_file_2)
    #         else:
    #             raw_text_2 = docx2txt.process(doc_file_2)
    #     elif doc_file_2 is None:
    #         raw_text_2 = st.text_area('Upload second document manually')

    #     a=embed_fn([raw_text_1])
    #     b=embed_fn([raw_text_2])
    #     cosine=cosine_similarity(a,b)[0][0]*100
    #     if st.button('Calculate similarity'):
    #         st.write(f'The similarity is {round(cosine,2)} %')




    elif choice=='Email extractor':
        st.subheader('Email extractor')
        doc_file = st.file_uploader('Upload', type=['csv', 'pdf', 'text', 'docx'])
        if doc_file is not None:
            file_details = doc_file.type
            if file_details == 'text/plain':
                raw_text = str(doc_file.read(), 'utf-8')
                if st.checkbox('Display original text'):
                    st.write(raw_text)
            elif file_details == 'application/pdf':
                raw_text = read_pdf(doc_file)
                if st.checkbox('Display original text'):
                    st.write(raw_text)
            else:
                raw_text = docx2txt.process(doc_file)
                if st.checkbox('Display original text'):
                    st.write(raw_text)
        elif doc_file is None:
            raw_text = st.text_area('Enter your input')


        tasks_list = ["Emails"]
        task_option = st.sidebar.multiselect("Task", tasks_list, default="Emails")
        task_mapper = {"Emails": nfx.extract_emails(raw_text)}

        all_results = []
        for task in task_option:
            result = task_mapper[task]
            # st.write(result)
            all_results.append(result)
        st.write(all_results)

        with st.beta_expander("Results As DataFrame"):
            result_df = pd.DataFrame(all_results).T
            result_df.columns = task_option
            st.dataframe(result_df)
            #make_downloadable_df(result_df)

    elif choice=='Spelling correction':
        st.subheader('Spell checker and corrector')
        doc_file = st.file_uploader('Upload', type=['csv', 'pdf', 'text', 'docx'])
        if doc_file is not None:
            file_details = doc_file.type
            if file_details == 'text/plain':
                raw_text = str(doc_file.read(), 'utf-8')
                if st.checkbox('Display original text'):
                    st.write(raw_text)
            elif file_details == 'application/pdf':
                raw_text = read_pdf(doc_file)
                if st.checkbox('Display original text'):
                    st.write(raw_text)
            else:
                raw_text = docx2txt.process(doc_file)
                if st.checkbox('Display original text'):
                    st.write(raw_text)
        elif doc_file is None:
            raw_text = st.text_area('Enter your input')

        spell = SpellChecker()
        misspelled_word_list = raw_text.split()
        misspelled_word = spell.unknown(misspelled_word_list)
        # spell.correction() works on single words, so correct the text word by word
        b = " ".join(spell.correction(word) or word for word in misspelled_word_list)
        if st.button('Get corrected output'):
            st.write(b)
        if st.button('Analyze'):
            for word in misspelled_word:
                if word != spell.correction(word):
                    st.write('Original word:', word)
                    st.write('correct word:', spell.correction(word))
                    st.write('Suggested words:', spell.candidates(word))
                    #st.write('\n')





    elif choice=='Question and answering':
        st.subheader('Question and Answering system')

        doc_file=st.file_uploader('Upload',type=['csv','pdf','text','docx'])
        #


        if doc_file is not None:
            file_details=doc_file.type
            if file_details=='text/plain':
                raw_text=str(doc_file.read(),'utf-8')
                if st.checkbox('Display original text'):
                    st.write(raw_text)
            elif file_details=='application/pdf':
                raw_text=read_pdf(doc_file)
                if st.checkbox('Display original text'):
                    st.write(raw_text)
            else:
                raw_text=docx2txt.process(doc_file)
                if st.checkbox('Display original text'):
                    st.write(raw_text)
        elif doc_file is None:
            raw_text = st.text_area('Enter your input')

        st.subheader('Enter your question')
        question=st.text_area('What"s in your mind?')


        # if st.button('Generate answer'):
        #
        #     qna=QnA(question,raw_text)
        #     st.write(qna)

    elif choice=='Text generation':
        pass

    else:
        st.header('About')
        st.write('''
        # This web application is built by *Arindam Mondal*, a Masters student in Data Analytics.''')
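
The summarization tab above depends on sumy_summarizer and evaluate_summary helpers that are not shown. A sketch under the assumption that they wrap sumy's LexRank summarizer and the rouge package (the eval_df indexing above implies a DataFrame whose index carries the r/p/f metrics):

import pandas as pd
from rouge import Rouge
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

def sumy_summarizer(docx, num_sentences=2):
    # LexRank extractive summary of the input text
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    lex_summarizer = LexRankSummarizer()
    summary = lex_summarizer(parser.document, num_sentences)
    return " ".join(str(sentence) for sentence in summary)

def evaluate_summary(summary, reference):
    # ROUGE scores as a DataFrame with the metrics (r/p/f) on the index
    scores = Rouge().get_scores(summary, reference)
    return pd.DataFrame(scores[0])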
Example #5
def main():
    st.title('Text Cleaner App')

    menu = ["TextCleaner", "About"]
    choice = st.sidebar.selectbox("Menu", menu)

    if choice == 'TextCleaner':
        st.subheader("Text Cleaning")
        text_file = st.file_uploader("Upload Text File", type=['txt'])
        normalize_case = st.sidebar.checkbox("Normalize Case")
        clean_stopwords = st.sidebar.checkbox('Stopwords')
        clean_punctuations = st.sidebar.checkbox('Punctuations')
        clean_emails = st.sidebar.checkbox('Emails')
        clean_special_char = st.sidebar.checkbox('Special Characters')
        clean_numbers = st.sidebar.checkbox('Numbers')
        clean_urls = st.sidebar.checkbox('Urls')
        clean_emojis = st.sidebar.checkbox('Emojis')

        if text_file is not None:
            file_details = {
                "Filename": text_file.name,
                "Filesize": text_file.size,
                "Filetype": text_file.type
            }
            st.write(file_details)

            #Decode Text
            raw_text = text_file.read().decode('utf-8')
            col1, col2 = st.beta_columns(2)

            with col1:
                with st.beta_expander("Orginal Text"):
                    st.write(raw_text)

            with col2:
                with st.beta_expander("Processed Text"):
                    if normalize_case:
                        raw_text = raw_text.lower()

                    if clean_stopwords:
                        raw_text = nfx.remove_stopwords(raw_text)

                    if clean_numbers:
                        raw_text = nfx.remove_numbers(raw_text)

                    if clean_urls:
                        raw_text = nfx.remove_urls(raw_text)

                    if clean_punctuations:
                        raw_text = nfx.remove_punctuations(raw_text)

                    if clean_special_char:
                        raw_text = nfx.remove_special_characters(raw_text)

                    if clean_emails:
                        raw_text = nfx.remove_emails(raw_text)

                    if clean_emojis:
                        raw_text = nfx.remove_emojis(raw_text)

                    st.write(raw_text)

                    text_downloader(raw_text)

    else:
        st.subheader("About")
Example #6
def main():
    st.title("Text Analysis NLP _Beta v1.0")
    menu = ["Home", "Upload", "About"]

    choice = st.sidebar.selectbox("NLP Menu", menu)
    if choice == "Home":
        st.write(
            "Our day to day language can tell you an aboard patterns, insights and sentiments. Explore the power of Ai: Natural Language Processing algorithm and discover synchronicity that leads one to another. Free to use as much as you like! under GNU General Public License with a Motto #WeRiseByLiftingOthers"
        )
        st.write(
            "ML Analytics[@heroku streamlit-roy](https://streamlit-roy.herokuapp.com/)  Sample Dataset [@rupak-roy Github](https://github.com/rupak-roy/dataset-streamlit) "
        )
        st.write(
            "V3 update: Deep Learning module at [@share.streamlit.io] (https://share.streamlit.io/rupak-roy/streamlit_deeplearning_analytics/main/ML.py)"
        )

        raw_text = st.text_area("Enter Text Here")
        num_of_most_common = st.sidebar.number_input("Min Common Keywords", 5,
                                                     15)
        if st.button("Analyze"):

            #       with st.beta_expander("Original Text"):
            #          st.write(raw_text)
            with st.beta_expander("Text Analysis"):
                token_result_df = text_analyzer(raw_text)
                st.dataframe(token_result_df)

            with st.beta_expander("Entities Explorer"):
                # entity_result = get_entities(raw_text)
                # st.write(entity_result)

                entity_result = render_entities(raw_text)
                stc.html(entity_result, height=300, scrolling=True)

            with st.beta_expander("Summary using LexRank Approach"):
                st.text(
                    "Disclaimer: LexRank is an unsupervised approach to text summarization based on graph-based centrality scoring of sentences. The main idea is that sentences “recommend” other similar sentences to the reader. Thus, if one sentence is very similar to many others, it will likely be a sentence of great importance. The importance of this sentence also stems from the importance of the sentences “recommending” it. Thus, to get ranked highly and placed in a summary, a sentence must be similar to many sentences that are in turn also similar to many other sentences. This makes intuitive sense and allows the algorithms to be applied to any arbitrary new text."
                )
                my_summary = sumy_summarizer(raw_text)
                document_len = {
                    "Original": len(raw_text),
                    "Summary": len(my_summary)
                }
                st.write(document_len)
                st.write(my_summary)

                st.info(
                    "Rouge Score: F-Score:The Higher the Better the results, R-Recall/Sensitivty: refers correctly predicted positive observations to the all observations was actually positive, P - Precision talks about how precise/accurate your model is out of those predicted positive, how many of them are actual positive. Ranges from 0-1 The higher the score the better the model is."
                )
                eval_df = evaluate_summary(my_summary, raw_text)
                st.dataframe(eval_df.T)
                eval_df['metrics'] = eval_df.index
                c = alt.Chart(eval_df).mark_bar().encode(x='metrics',
                                                         y='rouge-1')
                st.altair_chart(c)

            with st.beta_expander("Summary using TextRank Approach"):
                st.text(
                    "Note: One of the famous Text Summarization algorithm gets its name from Larry Page, one of the co-founders of Google."
                )
                my_summary = summarize(raw_text)
                document_len = {
                    "Original": len(raw_text),
                    "Summary": len(my_summary)
                }
                st.write(document_len)
                st.write(my_summary)
                st.info(
                    "Rouge Score: F-Score:The Higher the Better the results, R-Recall/Sensitivty: refers correctly predicted positive observations to the all observations was actually positive, P - Precision talks about how precise/accurate your model is out of those predicted positive, how many of them are actual positive. Ranges from 0-1 The higher the score the better the model is."
                )
                eval_df = evaluate_summary(my_summary, raw_text)
                st.dataframe(eval_df)
                eval_df['metrics'] = eval_df.index
                c = alt.Chart(eval_df).mark_bar().encode(x='metrics',
                                                         y='rouge-1')
                st.altair_chart(c)

            # Layouts
            col1, col2 = st.beta_columns(2)

            with col1:

                with st.beta_expander("Word Statistics"):
                    st.info("Word Statistics")
                    docx = nt.TextFrame(raw_text)
                    st.write(docx.word_stats())

                with st.beta_expander("Top Keywords/Tokens"):
                    st.info("Top Keywords/Tokens")
                    processed_text = nfx.remove_stopwords(raw_text)
                    keywords = get_most_common_tokens(processed_text,
                                                      num_of_most_common)
                    st.write(keywords)

                with st.beta_expander("Sentiment Explorer"):
                    st.info("Sentiment Analysis")
                    sent_result = get_sentiment(raw_text)
                    st.write(sent_result)

            with col2:

                with st.beta_expander("Word Frequency Graph"):
                    fig = plt.figure()
                    top_keywords = get_most_common_tokens(
                        processed_text, num_of_most_common)
                    plt.bar(top_keywords.keys(), top_keywords.values())
                    plt.xticks(rotation=45)
                    st.pyplot(fig)

                with st.beta_expander("Part of Speech(PoS) Graph"):
                    try:
                        fig = plt.figure()
                        sns.countplot(token_result_df["PoS"])
                        plt.xticks(rotation=45)
                        st.pyplot(fig)
                    except:
                        st.warning("Error: Insufficient Data")

                with st.beta_expander("Plot Wordcloud"):
                    try:
                        plot_wordcloud(raw_text)
                    except:
                        st.warning("Error: Insufficient Data")

                with st.beta_expander("Stylography Explorer"):
                    st.info("using Mendelhall Curve")
                    plot_mendelhall_curve_2(raw_text)

            with st.beta_expander("Download The Analysis Report"):
                make_downloadable(token_result_df)

    elif choice == "Upload":
        st.write(
            "Our day to day language can tell you an aboard patterns, insights and sentiments. Explore the prower of Ai: Natural Language Processing algorithim and discover synchronicity that leads one to another. Free to use as much as you like! under GNU General Public License with a Motto #WeRiseByLiftingOthers"
        )
        st.write(
            "ML Analytics[@heroku streamlit-roy](https://streamlit-roy.herokuapp.com/)  Sample Dataset [@rupak-roy Github](https://github.com/rupak-roy/dataset-streamlit) "
        )
        st.write(
            "V3 update: Deep Learning module at [@share.streamlit.io] (https://share.streamlit.io/rupak-roy/streamlit_deeplearning_analytics/main/ML.py)"
        )

        text_file = st.file_uploader("Upload Files",
                                     type=["pdf", "docx", "txt"])
        num_of_most_common = st.sidebar.number_input("Min Common Keywords", 5,
                                                     15)

        if text_file is not None:
            if text_file.type == "application/pdf":
                raw_text = read_pdf(text_file)
                # st.write(raw_text)
            elif text_file.type == "text/plain":
                # st.write(text_file.read()) # read as bytes
                raw_text = str(text_file.read(), "utf-8")
                # st.write(raw_text)
            else:
                raw_text = docx2txt.process(text_file)
                # st.write(raw_text)

            with st.beta_expander("Original Text"):
                st.write(raw_text)

            with st.beta_expander("Text Analysis"):
                token_result_df = text_analyzer(raw_text)
                st.dataframe(token_result_df)

            with st.beta_expander("Entities Explorer"):
                # entity_result = get_entities(raw_text)
                # st.write(entity_result)

                entity_result = render_entities(raw_text)
                stc.html(entity_result, height=300, scrolling=True)

            with st.beta_expander("Summary using LexRank Approach"):
                st.text(
                    "Disclaimer: LexRank is an unsupervised approach to text summarization based on graph-based centrality scoring of sentences. The main idea is that sentences “recommend” other similar sentences to the reader. Thus, if one sentence is very similar to many others, it will likely be a sentence of great importance. The importance of this sentence also stems from the importance of the sentences “recommending” it. Thus, to get ranked highly and placed in a summary, a sentence must be similar to many sentences that are in turn also similar to many other sentences. This makes intuitive sense and allows the algorithms to be applied to any arbitrary new text."
                )
                my_summary = sumy_summarizer(raw_text)
                document_len = {
                    "Original": len(raw_text),
                    "Summary": len(my_summary)
                }
                st.write(document_len)
                st.write(my_summary)

                st.info(
                    "Rouge Score: F-Score:The Higher the Better the results, R-Recall/Sensitivty: refers correctly predicted positive observations to the all observations was actually positive, P - Precision talks about how precise/accurate your model is out of those predicted positive, how many of them are actual positive. Ranges from 0-1 The higher the score the better the model is."
                )
                eval_df = evaluate_summary(my_summary, raw_text)
                st.dataframe(eval_df.T)
                eval_df['metrics'] = eval_df.index
                c = alt.Chart(eval_df).mark_bar().encode(x='metrics',
                                                         y='rouge-1')
                st.altair_chart(c)

            with st.beta_expander("Summary using TextRank Approach"):
                st.text(
                    "Note: One of the famous Text Summarization algorithm gets its name from Larry Page, one of the co-founders of Google."
                )
                my_summary = summarize(raw_text)
                document_len = {
                    "Original": len(raw_text),
                    "Summary": len(my_summary)
                }
                st.write(document_len)
                st.write(my_summary)

                st.info(
                    "Rouge Score: F-Score:The Higher the Better the results, R-Recall/Sensitivty: refers correctly predicted positive observations to the all observations was actually positive, P - Precision talks about how precise/accurate your model is out of those predicted positive, how many of them are actual positive. Ranges from 0-1 The higher the score the better the model is."
                )
                eval_df = evaluate_summary(my_summary, raw_text)
                st.dataframe(eval_df)
                eval_df['metrics'] = eval_df.index
                c = alt.Chart(eval_df).mark_bar().encode(x='metrics',
                                                         y='rouge-1')
                st.altair_chart(c)

            # Layouts
            col1, col2 = st.beta_columns(2)

            with col1:
                with st.beta_expander("Word Statistics"):
                    st.info("Word Statistics")
                    docx = nt.TextFrame(raw_text)
                    st.write(docx.word_stats())

                with st.beta_expander("Top Keywords/Tokens"):
                    st.info("Top Keywords/Tokens")
                    processed_text = nfx.remove_stopwords(raw_text)
                    keywords = get_most_common_tokens(processed_text,
                                                      num_of_most_common)
                    st.write(keywords)

                with st.beta_expander("Sentiment Explorer"):
                    st.info("Sentiment Analysis")
                    sent_result = get_sentiment(raw_text)
                    st.write(sent_result)

            with col2:
                with st.beta_expander("Word Frequency Graph"):
                    fig = plt.figure()
                    top_keywords = get_most_common_tokens(
                        processed_text, num_of_most_common)
                    plt.bar(top_keywords.keys(), top_keywords.values())
                    plt.xticks(rotation=45)
                    st.pyplot(fig)

                with st.beta_expander("Part of Speech(Pos) Graph"):
                    try:

                        fig = plt.figure()
                        sns.countplot(token_result_df["PoS"])
                        plt.xticks(rotation=45)
                        st.pyplot(fig)
                    except:
                        st.warning("Error: Insufficient Data")

                with st.beta_expander("Plot Wordcloud"):
                    try:
                        plot_wordcloud(raw_text)
                    except:
                        st.warning("Error: Insufficient Data")

                with st.beta_expander("Stylography Explorer"):
                    st.info("using Mendelhall Curve")
                    plot_mendelhall_curve_2(raw_text)

            if st.sidebar.checkbox("Top Keywords/Tokens"):
                st.info("Top Keywords/Tokens")
                processed_text = nfx.remove_stopwords(raw_text)
                keywords = get_most_common_tokens(processed_text,
                                                  num_of_most_common)
                st.write(keywords)

            if st.sidebar.checkbox("Part of Speech(Pos) Graph"):
                fig = plt.figure()
                sns.countplot(token_result_df["PoS"])
                plt.xticks(rotation=45)
                st.pyplot(fig)

            if st.sidebar.checkbox("Sentiment Analysis"):
                st.info("Sentiment Analysis")
                sent_result = get_sentiment(raw_text)
                st.write(sent_result)

            if st.sidebar.checkbox("Stylography Analysis"):
                st.info("using Mendelhall Curve")
                plot_mendelhall_curve_2(raw_text)

            if st.sidebar.checkbox("Plot Word Frequency Graph"):
                fig = plt.figure()
                top_keywords = get_most_common_tokens(processed_text,
                                                      num_of_most_common)
                plt.bar(top_keywords.keys(), top_keywords.values())
                plt.xticks(rotation=45)
                st.pyplot(fig)

            if st.sidebar.checkbox("Plot WordCloud"):
                plot_wordcloud(raw_text)

            with st.beta_expander("Download The Analysis Report"):
                make_downloadable(token_result_df)

    else:
        st.subheader("About")
        st.text("Thank you for your time")

        st.markdown("""
Hi, I'm Bob, aka Rupak Roy. Things I write about frequently on Quora & LinkedIn: Analytics for Beginners, Data Science, Machine Learning, Deep Learning, Natural Language Processing (NLP), Computer Vision, Big Data Technologies, Internet of Things and many other random topics of interest.
I formerly co-founded various AI-based projects to inspire and nurture the human spirit with AI training on how to leverage AI to solve problems for exponential growth.

My career contour spans various technologies, from a Master of Science in Information Technology to Commerce, with the privilege of being Wiley certified in various analytical domains. My alternative internet presences: Facebook, Blogger, LinkedIn, Medium, Instagram, ISSUU and Data2Dimensions.
If you wish to learn more about Data Science, follow me at:

~ Medium [@rupak.roy](https://medium.com/@rupak.roy)

~ Linkedin [@bobrupak](https://www.linkedin.com/in/bobrupak/)

My Fav. Quote:

Millions saw the apple fall but only Newton asked why! ~ “Curiosity is the spark of perfection and innovations. So connect with data and discover sync“
""")
        st.image('img/prism.gif')
        with st.beta_expander("Suprise!"):
            st.title(
                "COLLECT YOUR FULL VERSION MACHINE LEARNING APP @ ping_me #socialmedia"
            )
            st.image('img/office.jpg')
            st.info("")
            st.success("")
            st.warning("")
            st.error("")
Example #7
def main():

	# st.title("Data Science Application Dashboard")

	menu = ['Home', 'NLP Summarization App', 'NLP Text Analysis', 'NLP Q&A App']
	choice = st.sidebar.selectbox("Menu", menu)

	# Home Page
	if choice == 'Home':
		
		st.markdown("""
						<div align='center'><h1 style="color:blue"><font size="5"> Welcome to Data Science Application Dashboard </font> </h1></div> <br>\
						<p> The purpose of this dashboard is to demonstrate few applications on natural language processing (NLP). NLP is a branch of artifical intelligence \
						that deals with the interaction between computers and humans using language. The goals for NLP is to read, understand, translate, and make sense of \
						the humana langages in a manner that is valuable. </p>

						<p> Three applications are demonstrated in the dashboard. Brief descriptions are shown in below: </p>

						<ol>
							<li> NLP Summarization App - This app will allow you to either enter an article from URL, copy and paste texts, or upload a file (.pdf, .docx, .txt). The app will apply \
							TextRank Algorithm to summarize the article and also evaluate the summary using Rouge scores. </li>
							<li>NLP Text Analysis App - This app will allow you to either enter an article from URL, copy and paste texts, or upload a file (.pdf, .docx, .txt). The app will apply \
							variety of python packages (e.g., spacy, neattext, wordcloud, etc) to analyze the article and generate word statistics, top keywords, sentiment, wordcloud, and more. </li>
							<li> NLP Q&A App - This app will allow you to either enter an article from URL or copy and paste texts. The app will read the article and you can ask any questions \
							related to the article. The app will apply cosine similarity to response few sentences that are closed related to your question. </li>
						</ol> 


						 """, unsafe_allow_html=True)



	# NLP Q&A Page
	elif choice == 'NLP Q&A App':

		st.subheader("Natural Language Processing (NLP) Q&A Application")
		file_format = ['URL','Plain Text']
		files = st.sidebar.selectbox("Input Format", file_format)

		if files == 'URL':

			text_input = st.text_area("Please Enter an URL Article")
			if text_input:

				try:

					#print(text_input)

					# Get Article
					article = Article(text_input.strip())
					article.download()
					article.parse()
					article.nlp()
					corpus  = article.text

					#print(corpus)
					#print(text)

					# Tokenization
					text = corpus
					sentence_list = nltk.sent_tokenize(text)

					# print(text)

					question = st.text_area("Please Enter a question related to the Artcile")

					if question:

						response = bot_response(user_input = question, sentence_list = sentence_list)
						st.subheader(response)

				except:

					st.warning("Please Enter a correct URL with Article")

		elif files == 'Plain Text':

			text_input = st.text_area("Please Enter Text Here")
			if text_input:

				try:

					question = st.text_area("Please Enter a question related to the Artcile")

					sentence_list = nltk.sent_tokenize(text_input)

					# print(sentence_list)

					if question:

						response = bot_response(user_input = question, sentence_list = sentence_list)
						st.subheader(response)

				except:

					st.warning("Please Enter some text")

	# NLP Summarization page
	elif choice == 'NLP Summarization App':

		st.subheader("Natural Language Processing (NLP) Summarization Application")
		file_format = ['URL','Plain Text', 'Upload a File']
		files = st.sidebar.selectbox("Input Format", file_format)

		if files == 'URL':

			text_input = st.text_area("Please Enter an URL Article")
			if st.button("Summarize"):

				try:
					# Get Article
					article = Article(text_input.strip())
					article.download()
					article.parse()
					article.nlp()
					corpus  = article.text

					#print(corpus)
					#print(text)

					# Tokenization
					text = corpus
					sentence_list = nltk.sent_tokenize(text)

					art_text = " ".join(sentence_list)

					with st.beta_expander("Original Text"):
						st.write(art_text)

					#c1, c2 = st.beta_columns(2)

					#with c1:
					#	with st.beta_expander("LexRank Summary"):
					#		pass

					with st.beta_expander("TextRank Summary"):
						textrank_sum = summarize(art_text)
						doc_length = {"Article Word Count": len(art_text), "Summary Word Count": len(textrank_sum)}
						st.write(doc_length)
						st.write(textrank_sum)


						st.info("Rouge Score")
						score = evaluate_summary(textrank_sum, art_text)
						st.dataframe(score)

				except:
					st.warning("Please Enter a correct URL with Article")

		elif files == 'Plain Text':

			text_input = st.text_area("Please Enter Text Here")
			if st.button("Summarize"):

				try:

					sentence_list = nltk.sent_tokenize(text_input)
					art_text = " ".join(sentence_list)

					with st.beta_expander("Original Text"):
						st.write(art_text)

					with st.beta_expander("TextRank Summary"):
						textrank_sum = summarize(art_text)
						doc_length = {"Article Word Count": len(art_text.split()), "Summary Word Count": len(textrank_sum.split())}
						st.write(doc_length)
						st.write(textrank_sum)


						st.info("Rouge Score")
						score = evaluate_summary(textrank_sum, art_text)
						st.dataframe(score)

				except:
					st.warning("Please Enter more sentences")

		elif files == 'Upload a File':

			text_file = st.file_uploader("Please Upload a File", type = ['pdf', 'docx', 'txt'])

			if text_file is not None:
				if text_file.type == 'application/pdf':
					text_input = read_pdf(text_file)

				elif text_file.type == 'text/plain':
					text_input = str(text_file.read(), 'utf-8')

				else:
					text_input = docx2txt.process(text_file)

				try:

					with st.beta_expander("Original Text"):
						st.write(text_input)

					with st.beta_expander("TextRank Summary"):
						textrank_sum = summarize(text_input)
						doc_length = {"Article Word Count": len(text_input.split()), "Summary Word Count": len(textrank_sum.split())}
						st.write(doc_length)
						st.write(textrank_sum)


						st.info("Rouge Score")
						score = evaluate_summary(textrank_sum, text_input)
						st.dataframe(score)

				except:
					st.warning("Please Enter more sentences")

	# NLP Text Analysis
	elif choice == 'NLP Text Analysis':

		st.subheader("Natural Language Processing (NLP) Text Analysis Application")
		file_format = ['URL','Plain Text', 'Upload a File']
		files = st.sidebar.selectbox("Input Format", file_format)

		if files == 'URL':

			text_input = st.text_area("Please Enter an URL Article")
			num_of_tokens = st.sidebar.number_input("Most Common Word", 5, 15)
			if st.button("Analyze"):

				try:
					# Get Article
					article = Article(text_input.strip())
					article.download()
					article.parse()
					article.nlp()
					corpus  = article.text

					# Tokenization
					text = corpus
					sentence_list = nltk.sent_tokenize(text)

					art_text = " ".join(sentence_list)

					# Original Text
					with st.beta_expander("Original Text"):
						st.write(art_text)

					# Text Analysis
					with st.beta_expander("Text Analysis"):
						token_df = text_analysis(art_text)
						st.dataframe(token_df)

					# Entities
					with st.beta_expander("Entities"):
						#ent_check = get_entities(art_text)
						#st.write(ent_check)

						entity_df = render_entities(art_text)
						stc.html(entity_df, height=500, scrolling=True)

					c1, c2 = st.beta_columns(2)

					
					with c1:
						# Word Statistics
						with st.beta_expander("Word Statistics"):
							st.info("Word Statistics")
							docx = nt.TextFrame(art_text)
							st.write(docx.word_stats())

						# Plot Part of Speech
						with st.beta_expander("Plot Part of Speech"):
							fig = plt.figure()
							sns.countplot(token_df['PoS'])
							plt.xticks(rotation=45)
							st.pyplot(fig)

						# Get Sentiment
						with st.beta_expander("Sentiment"):
							sent_result = get_sentiment(art_text)
							st.write(sent_result)
					
					with c2:
						# Most Common Word
						with st.beta_expander("Top Keywords"):
							st.info("Top Keywords/Tokens")
							lower_text = art_text.lower()
							remove_sw = nfx.remove_stopwords(lower_text)
							keyword = most_word(remove_sw, num_of_tokens)
							st.write(keyword)

						# Plot Word Freq
						with st.beta_expander("Plot Top Word Frequency"):
							fig = plt.figure()
							top_word = most_word(remove_sw, num_of_tokens)
							plt.bar(top_word.keys(), top_word.values())
							plt.xticks(rotation=45)
							st.pyplot(fig)

						# Generate WordCloud
						with st.beta_expander("Plot WordCloud"):
							lower_text = art_text.lower()
							remove_sw = nfx.remove_stopwords(lower_text)
							plot_wordcloud(remove_sw)

					#with st.beta_expander("Download Text Analysis Results"):
					#	make_downloadable(token_df)

				except:
					st.warning("Please Enter a correct URL with Article")

		elif files == 'Plain Text':

			text_input = st.text_area("Please Enter Text Here")
			num_of_tokens = st.sidebar.number_input("Most Common Word", 5, 15)
			if st.button("Analyze"):

				try:
					
					sentence_list = nltk.sent_tokenize(text_input)
					art_text = " ".join(sentence_list)

					# Original Text
					with st.beta_expander("Original Text"):
						st.write(art_text)

					# Text Analysis
					with st.beta_expander("Text Analysis"):
						token_df = text_analysis(art_text)
						st.dataframe(token_df)

					# Entities
					with st.beta_expander("Entities"):
						#ent_check = get_entities(art_text)
						#st.write(ent_check)

						entity_df = render_entities(art_text)
						stc.html(entity_df, height=500, scrolling=True)

					c1, c2 = st.beta_columns(2)

					
					with c1:
						# Word Statistics
						with st.beta_expander("Word Statistics"):
							st.info("Word Statistics")
							docx = nt.TextFrame(art_text)
							st.write(docx.word_stats())

						# Plot Part of Speech
						with st.beta_expander("Plot Part of Speech"):
							fig = plt.figure()
							sns.countplot(token_df['PoS'])
							plt.xticks(rotation=45)
							st.pyplot(fig)

						# Get Sentiment
						with st.beta_expander("Sentiment"):
							sent_result = get_sentiment(art_text)
							st.write(sent_result)
					
					with c2:
						# Most Common Word
						with st.beta_expander("Top Keywords"):
							st.info("Top Keywords/Tokens")
							lower_text = art_text.lower()
							remove_sw = nfx.remove_stopwords(lower_text)
							keyword = most_word(remove_sw, num_of_tokens)
							st.write(keyword)

						# Plot Word Freq
						with st.beta_expander("Plot Top Word Frequency"):
							fig = plt.figure()
							top_word = most_word(remove_sw, num_of_tokens)
							plt.bar(top_word.keys(), top_word.values())
							plt.xticks(rotation=45)
							st.pyplot(fig)

						# Generate WordCloud
						with st.beta_expander("Plot WordCloud"):
							lower_text = art_text.lower()
							remove_sw = nfx.remove_stopwords(lower_text)
							plot_wordcloud(remove_sw)

					#with st.beta_expander("Download Text Analysis Results"):
					#	make_downloadable(token_df)

				except:
					st.warning("Please Enter more sentences")

		elif files == 'Upload a File':
			
			text_file = st.file_uploader("Please Upload a File", type = ['pdf', 'docx', 'txt'])
			num_of_tokens = st.sidebar.number_input("Most Common Word", 5, 15)
			if text_file is not None:
				if text_file.type == 'application/pdf':
					text_input = read_pdf(text_file)

				elif text_file.type == 'text/plain':
					text_input = str(text_file.read(), 'utf-8')

				else:
					text_input = docx2txt.process(text_file)

				try:

					# Original Text
					with st.beta_expander("Original Text"):
						st.write(text_input)

					# Text Analysis
					with st.beta_expander("Text Analysis"):
						token_df = text_analysis(text_input)
						st.dataframe(token_df)

					# Entities
					with st.beta_expander("Entities"):
						#ent_check = get_entities(art_text)
						#st.write(ent_check)

						entity_df = render_entities(text_input)
						stc.html(entity_df, height=500, scrolling=True)

					c1, c2 = st.beta_columns(2)

					
					with c1:
						# Word Statistics
						with st.beta_expander("Word Statistics"):
							st.info("Word Statistics")
							docx = nt.TextFrame(text_input)
							st.write(docx.word_stats())

						# Plot Part of Speech
						with st.beta_expander("Plot Part of Speech"):
							fig = plt.figure()
							sns.countplot(token_df['PoS'])
							plt.xticks(rotation=45)
							st.pyplot(fig)

						# Get Sentiment
						with st.beta_expander("Sentiment"):
							sent_result = get_sentiment(text_input)
							st.write(sent_result)
					
					with c2:
						# Most Common Word
						with st.beta_expander("Top Keywords"):
							st.info("Top Keywords/Tokens")
							lower_text = text_input.lower()
							remove_sw = nfx.remove_stopwords(lower_text)
							keyword = most_word(remove_sw, num_of_tokens)
							st.write(keyword)

						# Plot Word Freq
						with st.beta_expander("Plot Top Word Frequency"):
							fig = plt.figure()
							top_word = most_word(remove_sw, num_of_tokens)
							plt.bar(top_word.keys(), top_word.values())
							plt.xticks(rotation=45)
							st.pyplot(fig)

						# Generate WordCloud
						with st.beta_expander("Plot WordCloud"):
							lower_text = text_input.lower()
							remove_sw = nfx.remove_stopwords(lower_text)
							plot_wordcloud(remove_sw)

				except:
					st.warning("The uploaded file does not have enough text")
Example #8
def main():
	"""Author Attribution and Verifying App"""
	stc.html(HTML_BANNER)
	menu = ["Home","About"]


	choice = st.sidebar.selectbox("Menu",menu)

	if choice == 'Home':
		st.subheader("Text Analysis")
		
		raw_text = st.text_area('Enter Text Here')
		if len(raw_text) > 2:
			col1,col2 = st.beta_columns(2)
			process_text = nfx.remove_stopwords(raw_text)
			with col1:
				with st.beta_expander('Original Text'):
					st.write(raw_text)

				with st.beta_expander("Preview Tagged Text"):
					tagged_docx = generate_tags(raw_text)
					processed_tag_docx = mytag_visualizer(tagged_docx)
					stc.html(processed_tag_docx,scrolling=True)

				with st.beta_expander("Plot Word Freq"):
					st.info("Plot For Most Common Tokens")
					most_common_tokens = get_most_common_tokens(process_text,20)
					# st.write(most_common_tokens)
					tk_df = pd.DataFrame({'tokens':most_common_tokens.keys(),'counts':most_common_tokens.values()})
					# tk_df = pd.DataFrame(most_common_tokens.items(),columns=['tokens','counts'])
					# st.dataframe(tk_df)
					# st.bar_chart(tk_df)
					brush = alt.selection(type='interval', encodings=['x'])
					c = alt.Chart(tk_df).mark_bar().encode(
						    x='tokens',
						    y='counts',
						    opacity=alt.condition(brush, alt.OpacityValue(1), alt.OpacityValue(0.7)),
						    ).add_selection(brush)
						
					st.altair_chart(c,use_container_width=True)

			with col2:
				with st.beta_expander('Processed Text'):
					
					st.write(process_text)

				with st.beta_expander("Plot Wordcloud"):
					st.info("word Cloud")
					plot_wordcloud(process_text)

				with st.beta_expander("Plot Mendelhall Curve"):
					plot_mendelhall_curve_2(raw_text)


		elif len(raw_text) == 1:
			st.warning("Insufficient Text, Minimum must be more than 1")


		




		

	elif choice == "About":
		st.subheader("Text Analysis NLP App")
def main():
    st.title("Text Analysis App")

    menu = ["English", "Chinese", "Japanese"]
    Choice = st.sidebar.selectbox("Menu", menu)

    if Choice == "English":
        st.subheader("EN")
        # Text Area
        raw_text = st.text_area("Enter Text Here")

        if st.button("Submit"):
            if len(raw_text) > 2:
                st.success("Processing")
            elif len(raw_text) == 1:
                st.warning("Insufficient Text,Minimum is 2")
            else:
                st.write("Enter Text")
        # if raw_text is not None:

        # Layout
        col1, col2 = st.beta_columns(2)
        processed_text = nfx.remove_stopwords(raw_text)
        processed_text_deep = nfx.remove_special_characters(processed_text)

        with col1:
            with st.beta_expander("Original Text"):
                st.write(raw_text)

            with st.beta_expander("Pos Tagger Text"):
                #tagged_docx = get_plot_pos_tags(processed_text)
                #st.dataframe(tagged_docx)
                st.success("Part Of Speech")
                # Components HTML
                tagged_docx = TextBlob(raw_text).tags  # generate tags

                tagged_span_color_html = mytag_visualizer(tagged_docx)
                stc.html(tagged_span_color_html, scrolling=True)
            try:
                with st.beta_expander("Plot Word Freqency"):
                    max_limit = len(processed_text.split())
                    num_of_tokens = st.number_input("Num of tokens", 10,
                                                    max_limit)
                    plot_word_freq(processed_text, num_of_tokens)
                    #plot_word_freq_with_altair(processed_text,num_of_tokens)
            except:
                pass

        with col2:

            with st.beta_expander("Processed Text"):
                processed_text = nfx.remove_stopwords(raw_text)
                st.write(processed_text_deep)
            try:
                with st.beta_expander("Plot WordCloud"):
                    st.success("Wordcloud")
                    plot_wordcloud(processed_text_deep)
            except:
                pass

            with st.beta_expander("Plot Stylpmetry Curve"):
                st.success("Mendehall Curve")
                if len(raw_text) > 2:
                    plot_mendelhall_curve(raw_text)

    elif Choice == "Chinese":
        st.subheader("中文")

        # Text Area
        raw_text = st.text_area("请输入文本")

        # Preprocess
        processed_text = preprocess_tokens_list(raw_text)

        #pseg_list_tup = {words:tags for words, tags in pseg }
        #st.write((words_tag_dic))

        # submit bottom
        if st.button("Submit"):
            if len(raw_text) > 2:
                st.success("Processing")
            elif len(raw_text) == 1:
                st.warning("Insufficient Text,Minimum is 2")
            else:
                st.warning("请输入文本")
        # Basic tf-idf

        if len(processed_text) > 1:
            st.subheader("逆向文件频率")
            most_common_keyword_extracter_idf_zn(processed_text)
            st.text("——————————————————" * 5)

        # Layout
        col1, col2 = st.beta_columns(2)
        with col1:
            with st.beta_expander("原文本"):
                st.write(raw_text)
        with col1:
            with st.beta_expander("划分词类"):
                if len(raw_text) > 1:
                    st.success("Part of Speech")
                    words_pos_tagged_dict = pos_jieba_zn(raw_text)

                    #st.write(words_pos_tagged_dict)
                    tagged_span_color_html = mytag_visualizer_zn(
                        words_pos_tagged_dict)
                    stc.html(tagged_span_color_html, scrolling=True)
                else:
                    st.warning("请输入文本")

        with col1:
            with st.beta_expander("词出现频率展示图"):
                max_limit = len(processed_text.split())
                if len(processed_text) > 1:
                    try:
                        num_of_tokens = st.number_input("num of tokens",
                                                        10,
                                                        max_value=max_limit)
                        plot_word_freq(processed_text, num_of_tokens)
                    except:
                        st.info("请输入足够的文本")
                else:
                    st.warning("请输入文本")

        with col2:
            with st.beta_expander("处理后的文本"):
                if len(processed_text) > 1:
                    st.write(processed_text)
                else:
                    st.warning("已输入的文本皆焉停用词,请再输入其他文本")

        with col2:
            with st.beta_expander("展示词云"):
                if len(processed_text) > 1:
                    st.success("Word Plot")
                    plot_wordcloud(processed_text, lang="zn")
                else:
                    st.warning("请输入文本")

        with col2:
            with st.beta_expander("词长度和出现频率展示图"):
                if len(raw_text) > 1:
                    st.success("Mendehall Curve")
                    plot_mendelhall_curve_zn(raw_text)

                else:
                    st.warning("请输入文字")

    else:
        st.subheader("JA")