Example No. 1
 def build_features(self, text):
     s = pd.Series(dtype=object)
     docx = nt.TextFrame(text=text)
     # Scan the percentage of noise (unclean data) in the text; call noise_scan() once and reuse the result
     noise = docx.noise_scan()
     s.loc["text_noise"] = noise["text_noise"]
     s.loc["text_length"] = noise["text_length"]
     s.loc["noise_count"] = noise["noise_count"]
     s.loc["vowels_count"] = docx.count_vowels()
     s.loc["consonants_count"] = docx.count_consonants()
     s.loc["stopwords_count"] = docx.count_stopwords()
     return s
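A minimal usage sketch for the feature builder above, assuming neattext and pandas are installed. The method is rewritten as a standalone function (the self argument dropped), and the CSV path and the 'twitts' column are borrowed from Example No. 3.

import pandas as pd
import neattext as nt

def build_features(text):
    # Same logic as the method above, packaged as a plain function
    s = pd.Series(dtype=object)
    docx = nt.TextFrame(text=text)
    noise = docx.noise_scan()
    s.loc["text_noise"] = noise["text_noise"]
    s.loc["text_length"] = noise["text_length"]
    s.loc["noise_count"] = noise["noise_count"]
    s.loc["vowels_count"] = docx.count_vowels()
    s.loc["consonants_count"] = docx.count_consonants()
    s.loc["stopwords_count"] = docx.count_stopwords()
    return s

data = pd.read_csv('twitter4000.csv')                 # dataset used in Example No. 3
features = data['twitts'].head(20).apply(build_features)
print(features.head())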
Example No. 2
 def preprocessing(self, text):
     docx = nt.TextFrame(text=text)
     docx = docx.remove_puncts()
     docx = docx.remove_numbers()
     docx = docx.remove_phone_numbers()
     docx = docx.remove_stopwords()
     docx = docx.remove_urls()
     docx = docx.remove_special_characters()
     docx = docx.remove_emojis()
     docx = docx.fix_contractions()
     return docx
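A quick, hedged sketch of how the pipeline above behaves, assuming neattext is installed: each remove_* call returns a new TextFrame (which is why Example No. 2 can keep reassigning docx), so the calls chain, and the cleaned string is read from the .text attribute.

import neattext as nt

raw = "Call me at 555 123 4567!! Visit https://example.com today, 100%"
docx = nt.TextFrame(text=raw)
cleaned = (docx.remove_puncts()
               .remove_numbers()
               .remove_urls()
               .remove_stopwords())
print(cleaned.text)   # the cleaned text as a plain string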
Example No. 3
dir(nt)



data = pd.read_csv('twitter4000.csv')

data

dir(nt)

s = data.iloc[4]['twitts']

s

# using textframe
docx = nt.TextFrame(s)

docx.describe()

# preview the first 10 characters
docx.head(10)

# remove stopwords
docx.remove_stopwords().text

# remove puncts
docx.remove_puncts().text

# remove all punctuation (not only the most common)
docx.remove_puncts(most_common = False).text
Example No. 4
    def get_wall_G(self, root, list_id_str, T=30, L=10, Q=0, sigma=8, use_followees=False):
        '''
        Build the directed graph (wall) of the Seeker for a given List.

        :param root: Root directory where the cached graph is stored
        :param list_id_str: Identifier of the List
        :param T: Number of days to look back from the List creation time
        :param L: Minimum number of times listed to be considered an expert
        :param Q: Minimum number of times listed in Lists associated with the topic of the List
        :param sigma: Bandwidth of the kernel, in minutes
        :param use_followees: If True, add follow edges from every user on the wall; otherwise only from the Seeker

        :return: Tuple (G, flag): the wall graph (None if scraping the wall failed) and a flag that is True unless the graph was freshly built and saved
        '''

        assert Q == 0

        # Get the List and the  Seeker
        my_list = self.scrape_list(list_id_str=list_id_str)
        seeker = self.scrape_user(id_str=my_list['user_id_str'])
        user_dir = os.path.join(root, f'{T}_{L}_{Q}_{sigma}', 'raw', my_list['user_id_str'])
        os.makedirs(user_dir, exist_ok=True)
        G_file = os.path.join(user_dir, f'G_{list_id_str}.pkl')
        if self.use_cache and os.path.exists(G_file):
            print(f'Getting wall from cache: {G_file}')
            G = utools.load_obj(G_file)
            return G, True
        # Set init and end time of wall
        end_dt = my_list['created_at']
        init_dt = end_dt - timedelta(days=T)

        # Get the wall
        success = self.scrape_wall(user_id_str=my_list['user_id_str'])
        if not success:
            print('\n ERROR SCRAPING THE WALL')
            return None, True
        df_wall = self.get_wall_from_db(my_list['user_id_str'],
                                        init_dt=init_dt,
                                        end_dt=end_dt
                                        )

        print(f"\nMin datetime: {df_wall['datetime'].min()}")
        print(f"Max datetime: {df_wall['datetime'].max()}\n")

        # Create the graph to store the wall and add the nodes
        G = nx.DiGraph()
        G.add_node(my_list['user_id_str'])
        G.add_nodes_from(list(df_wall['tweet_user_id_str'].unique()))
        members = self.scrape_list_members(list_id_str)
        if len(members) < my_list['member_count']:
            self.activate_cache(False)
            members = self.scrape_list_members(list_id_str)
            self.activate_cache(True)

        G.add_nodes_from(members)


        # Get the users' information
        all_user_id_str_list = list(G.nodes())
        df_users = self.scrape_many_users(all_user_id_str_list)
        if len(df_users) < len(G.nodes()):
            self.activate_cache(False)
            df_users = self.scrape_many_users(all_user_id_str_list)
            self.activate_cache(True)

        if len(df_users) != len(G.nodes()):
            id_str_1 = list(df_users['id_str'].unique())
            id_str_2 = list(G.nodes())
            print(utools.list_substract(id_str_2, id_str_1))
            assert False

        # %% Add Follow edges

        if not use_followees:

            followees = self.scrape_user_followees(seeker)
            if len(followees) < int(seeker['friends_count']):
                self.activate_cache(False)
                followees = self.scrape_user_followees(seeker)
                self.activate_cache(True)

            fol_in_wall = utools.list_intersection(all_user_id_str_list, followees)
            for fol in fol_in_wall:
                G.add_edge(seeker['id_str'], fol, follow=1)
        else:
            for _, u in df_users.iterrows():
                if u['friends_count'] == 0: continue
                followees = self.scrape_user_followees(u)
                fol_in_wall = utools.list_intersection(all_user_id_str_list, followees)
                for fol in fol_in_wall:
                    G.add_edge(u['id_str'], fol, follow=1)
        # %% Compute average number of times online at the same time

        df_t_seeker = self.get_user_activity(my_list['user_id_str'], init_dt, end_dt)
        df_l_seeker = self.get_user_activity(my_list['user_id_str'], init_dt, end_dt, favs=True)
        num_posts_seeker = len(df_t_seeker) + len(df_l_seeker)

        print(f"Number of posts of seeker: {num_posts_seeker}")

        seeker_dt_list = []

        if len(df_t_seeker) > 0:
            seeker_dt_list.extend(list(df_t_seeker['datetime'].map(lambda x: x.timestamp()) / 60))
        if len(df_l_seeker) > 0:
            seeker_dt_list.extend(list(df_l_seeker['datetime'].map(lambda x: x.timestamp()) / 60))
        G.nodes[seeker['id_str']]['dt_online'] = np.array(seeker_dt_list)

        for u_id_str, df_a in df_wall.groupby('tweet_user_id_str'):
            df_online = []
            if len(df_a) > 0:
                df_online = list(df_a['datetime'].map(lambda x: x.timestamp()) / 60)
            G.nodes[u_id_str]['dt_online'] = np.array(df_online)

        for id_str_i in all_user_id_str_list:
            for id_str_j in all_user_id_str_list:
                if 'dt_online' not in G.nodes[id_str_i]: continue
                if id_str_i == id_str_j: continue

                if 'dt_online' not in G.nodes[id_str_j]: continue

                dt_i = G.nodes[id_str_i]['dt_online']
                dt_j = G.nodes[id_str_j]['dt_online']

                diff_dt = dt_i[:, np.newaxis] - dt_j
                avg_num_online = np.sum(np.exp(-diff_dt ** 2 / sigma))

                if avg_num_online >= 1:
                    G.add_edge(id_str_i, id_str_j, avg_vis_tweets=avg_num_online)

        # %% Compute retweet, qtweets interactions

        for u_id_str, df_a in df_wall.groupby('user_id_str'):
            for i in ['qtweet', 'retweet']:
                df_i = df_a[df_a.type == i]
                for u_i, df_u_i in df_i.groupby('tweet_user_id_str'):
                    if u_i not in all_user_id_str_list: continue
                    my_dict = {i: len(df_u_i)}
                    G.add_edge(u_id_str, u_i, **my_dict)

        # Qtweets, retweets and answers for the seeker
        if len(df_t_seeker) > 0:
            for i in ['qtweet', 'retweet', 'answer']:

                df_i = df_t_seeker[df_t_seeker.type == i]
                for u_i, df_u_i in df_i.groupby('tweet_user_id_str'):
                    if u_i not in all_user_id_str_list: continue
                    my_dict = {i: len(df_u_i)}
                    G.add_edge(seeker['id_str'], u_i, **my_dict)

        # Likes for the seeker
        if len(df_l_seeker) > 0:
            for u_i, df_u_i in df_l_seeker.groupby('tweet_user_id_str'):
                if u_i not in all_user_id_str_list: continue
                my_dict = {'like': len(df_u_i)}
                G.add_edge(seeker['id_str'], u_i, **my_dict)
        # %% Add node attributes

        for user_id_str, df_u in df_wall.groupby('tweet_user_id_str'):
            G.nodes[user_id_str]['rate_total'] = len(df_u) / T

        df_users['type'] = 'User'

        df_users.loc[df_users.listed_count >= L, 'type'] = 'Expert'


        df_users.loc[df_users.id_str.isin(members), 'type'] = 'Member'

        for _, u in df_users.iterrows():
            u_dict = u.to_dict()

            del u_dict['id']
            del u_dict['name']
            del u_dict['screen_name']
            del u_dict['description']
            del u_dict['url']
            del u_dict['protected']
            del u_dict['created_at']
            del u_dict['geo_enabled']
            del u_dict['profile_image_url']

            G.add_node(u['id_str'], **u_dict)

        G.nodes[seeker['id_str']]['type'] = 'Seeker'
        G.nodes[seeker['id_str']]['rate_total'] = num_posts_seeker / T

        # %% Add node attributes based on wall. Rate (events/tweet) of emojis, url, ...

        all_nodes = list(G.nodes())
        for u_id_str, df_u in df_wall.groupby('tweet_user_id_str'):
            if u_id_str not in all_nodes: continue
            G.nodes[u_id_str]['emojis_rate'] = np.sum(df_u['emojis']) / len(df_u)
            G.nodes[u_id_str]['hashtags_rate'] = np.sum(df_u['hashtags']) / len(df_u)
            G.nodes[u_id_str]['urls_rate'] = np.sum(df_u['urls']) / len(df_u)
            G.nodes[u_id_str]['mentions_rate'] = np.sum(df_u['mentions']) / len(df_u)
            docx = nt.TextFrame(text=' '.join(list(df_u['text_processed'].values)))

            G.nodes[u_id_str]['bow'] = docx.bow()

        df_u = pd.concat([df_l_seeker, df_t_seeker])
        if len(df_u) > 0:
            G.nodes[seeker['id_str']]['emojis_rate'] = np.sum(df_u['emojis']) / len(df_u)
            G.nodes[seeker['id_str']]['hashtags_rate'] = np.sum(df_u['hashtags']) / len(df_u)
            G.nodes[seeker['id_str']]['urls_rate'] = np.sum(df_u['urls']) / len(df_u)
            G.nodes[seeker['id_str']]['mentions_rate'] = np.sum(df_u['mentions']) / len(df_u)
            docx = nt.TextFrame(text=' '.join(list(df_u['text_processed'].values)))

            G.nodes[seeker['id_str']]['bow'] = docx.bow()

        utools.save_obj(G_file, G)

        return G, False
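The avg_vis_tweets edge weights above come from a Gaussian kernel applied to the pairwise differences of minute-resolution activity timestamps. A small self-contained sketch of that computation with synthetic timestamps (sigma follows the function's default of 8 minutes):

import numpy as np

sigma = 8                                   # kernel bandwidth in minutes
dt_i = np.array([10.0, 50.0, 300.0])        # synthetic activity times of user i (minutes)
dt_j = np.array([12.0, 49.0, 1000.0])       # synthetic activity times of user j (minutes)

# Pairwise differences: entry (a, b) is dt_i[a] - dt_j[b]
diff_dt = dt_i[:, np.newaxis] - dt_j

# Pairs of events close in time contribute ~1, distant pairs contribute ~0
avg_num_online = np.sum(np.exp(-diff_dt ** 2 / sigma))
print(avg_num_online)                       # an edge is added when this value is >= 1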
Example No. 5
def main():
    st.set_page_config(page_title="20 in 1 NLP tasks",layout='wide')
    #st.title('NER recognition app')
    options=['Home','Analysis','Custom text cleaning','Question and answering','Text summarization','Email extractor','Spelling correction',
             'Text generation','About']
    choice=st.sidebar.selectbox('Choose accordingly',options)


    if choice=='Home':
        image=Image.open('1_a3xerDP7jqQglKxfIxfxVw.jpeg')
        st.image(image)
        st.header('Multi **NLP** tasks in a single window')
        st.write("""
        # This web app contains different text analyses with visual representations and advanced tasks like QnA and Text Generation
        """)




    elif choice=='Analysis':
        st.subheader('Upload document')

        doc_file = st.file_uploader('', type=['csv', 'pdf', 'text', 'docx'])

        if doc_file is not None:
            file_details = doc_file.type
            if file_details == 'text/plain':
                raw_text = str(doc_file.read(), 'utf-8')
            elif file_details == 'application/pdf':
                raw_text = read_pdf(doc_file)

            else:
                raw_text = docx2txt.process(doc_file)

        elif doc_file is None:
            st.subheader('Or enter your input')
            raw_text = st.text_area(' ')

        
        if st.sidebar.checkbox('Analyze'):
            num_of_most_common=st.sidebar.number_input('Most common tokens',5,15)
            with st.beta_expander('Original text'):
                st.write(raw_text)

            with st.beta_expander('Basic Text Analysis'):
                data=text_analyzer(raw_text)
                st.dataframe(data)


            col1,col2=st.beta_columns(2)

            with col1:
                with st.beta_expander('Word Stats'):
                    st.info('Words statistics')
                    doc=nt.TextFrame(raw_text)
                    st.write(doc.word_stats())
                with st.beta_expander("Top Keywords"):
                    st.info("Top Keywords/Tokens")
                    processed_text = nfx.remove_stopwords(raw_text)
                    keywords = get_most_common_tokens(
                        processed_text, num_of_most_common
                    )
                    st.write(keywords)

                with st.beta_expander("Sentiment"):
                    sent_result = sentiment(raw_text)
                    st.write(sent_result)

            with col2:
                with st.beta_expander("Plot Word Freq"):
                    fig = plt.figure()
                    top_keywords = get_most_common_tokens(
                        processed_text, num_of_most_common
                    )
                    plt.bar(keywords.keys(), top_keywords.values())
                    plt.xticks(rotation=45)
                    st.pyplot(fig)

                with st.beta_expander('Plot of part of speech'):
                    fig=plt.figure()
                    sns.countplot(data['PoS'])
                    plt.xticks(rotation=45)
                    st.pyplot(fig)
                with st.beta_expander('Word Cloud Visualization'):
                    plot_wordcloud(raw_text)

        if st.sidebar.checkbox('Name Entity Recognition'):
            doc = nlp(raw_text)
            spacy_streamlit.visualize_ner(doc, labels=nlp.get_pipe('ner').labels,
                                          attrs=['text', 'label_', 'start', 'end'])




    elif choice=='Custom text cleaning':
        st.subheader('Custom text cleaning')
        doc_file = st.file_uploader('', type=['csv', 'pdf', 'text', 'docx'])

        if doc_file is not None:
            file_details = doc_file.type
            if file_details == 'text/plain':
                raw_text = str(doc_file.read(), 'utf-8')
            elif file_details == 'application/pdf':
                raw_text = read_pdf(doc_file)

            else:
                raw_text = docx2txt.process(doc_file)

        elif doc_file is None:
            st.subheader('Or enter your input')
            raw_text = st.text_area(' ')

        normalization = st.sidebar.checkbox('Text normalization')
        clean_stopwords = st.sidebar.checkbox('Remove stopwords')
        clean_punctuation = st.sidebar.checkbox('Remove punctuation')
        clean_numeric = st.sidebar.checkbox('Remove numbers')
        clean_special = st.sidebar.checkbox('Remove special characters')
        clean_url = st.sidebar.checkbox('Clean URLs')

        if st.button('Start process'):



            col1,col2=st.beta_columns(2)
            with col1:
                with st.beta_expander('Original text'):
                    st.write('The length is :',len(raw_text))
                    st.write(raw_text)

            with col2:
                with st.beta_expander('Processed text'):
                    if normalization:
                        raw_text=raw_text.lower()
                    if clean_stopwords:
                        raw_text=nfx.remove_stopwords(raw_text)
                    if clean_url:
                        raw_text=nfx.remove_urls(raw_text)
                    if clean_special:
                        raw_text=nfx.remove_special_characters(raw_text)
                    if clean_punctuation:
                        raw_text=nfx.remove_punctuations(raw_text)
                    if clean_numeric:
                        raw_text=nfx.remove_numbers(raw_text)
                    st.write('The length is :',len(raw_text))
                    st.write(raw_text)




    elif choice=='Text summarization':
        st.subheader('Extractive text summarization')
        doc_file = st.file_uploader('Upload', type=['csv', 'pdf', 'text', 'docx'])
        #

        if doc_file is not None:
            file_details = doc_file.type
            if file_details == 'text/plain':
                raw_text = str(doc_file.read(), 'utf-8')
            elif file_details == 'application/pdf':
                raw_text = read_pdf(doc_file)
            else:
                raw_text = docx2txt.process(doc_file)
        elif doc_file is None:
            raw_text = st.text_area('Or enter your input manually')

        if st.button("Summarize"):
            with st.beta_expander("Original Text"):
                st.write(raw_text)
            c1, c2 = st.beta_columns(2)

            with c1:
                with st.beta_expander("LexRank Summary"):
                    my_summary = sumy_summarizer(raw_text)
                    document_len = {"Original": len(raw_text),
                                    "Summary": len(my_summary)}
                    st.write(document_len)
                    st.write(my_summary)

                    st.info("Rouge Score")
                    eval_df = evaluate_summary(my_summary, raw_text)
                    #st.dataframe(eval_df.T)
                    eval_df['metrics'] = eval_df.index
                    c = alt.Chart(eval_df).mark_bar().encode(
                        x='metrics', y='rouge-1')
                    st.altair_chart(c)

            with c2:
                with st.beta_expander("Frequency based summary"):
                    summary=freq_summarization(raw_text)
                    document_len = {"Original": len(raw_text),
                                    "Summary": len(summary)}
                    st.write(document_len)
                    st.write(summary)
                    st.info("Rouge Score")
                    eval_df = evaluate_summary(summary, raw_text)
                    #st.dataframe(eval_df.T)
                    eval_df['metrics'] = eval_df.index
                    c = alt.Chart(eval_df).mark_bar().encode(
                        x='metrics', y='rouge-1')
                    st.altair_chart(c)




    # elif choice=='Document similarity':
    #     st.subheader('Document similarity check')

    #     doc_file_1 = st.file_uploader('Upload first document', type=['csv', 'pdf', 'text', 'docx'])
    #     if doc_file_1 is not None:
    #         file_details = doc_file_1.type
    #         if file_details == 'text/plain':
    #             raw_text_1 = str(doc_file_1.read(), 'utf-8')
    #         elif file_details == 'application/pdf':
    #             raw_text_1 = read_pdf(doc_file_1)
    #         else:
    #             raw_text_1 = docx2txt.process(doc_file_1)
    #     elif doc_file_1 is None:
    #         raw_text_1 = st.text_area('Upload first document manually')

    #     doc_file_2 = st.file_uploader('Upload second document', type=['csv', 'pdf', 'text', 'docx'])
    #     if doc_file_1 is not None:
    #         file_details = doc_file_2.type
    #         if file_details == 'text/plain':
    #             raw_text_2 = str(doc_file_2.read(), 'utf-8')
    #         elif file_details == 'application/pdf':
    #             raw_text_2 = read_pdf(doc_file_2)
    #         else:
    #             raw_text_2 = docx2txt.process(doc_file_2)
    #     elif doc_file_2 is None:
    #         raw_text_2 = st.text_area('Upload second document manually')

    #     a=embed_fn([raw_text_1])
    #     b=embed_fn([raw_text_2])
    #     cosine=cosine_similarity(a,b)[0][0]*100
    #     if st.button('Calculate similarity'):
    #         st.write(f'The similarity is {round(cosine,2)} %')




    elif choice=='Email extractor':
        st.subheader('Email extractor')
        doc_file = st.file_uploader('Upload', type=['csv', 'pdf', 'text', 'docx'])
        if doc_file is not None:
            file_details = doc_file.type
            if file_details == 'text/plain':
                raw_text = str(doc_file.read(), 'utf-8')
                if st.checkbox('Display original text'):
                    st.write(raw_text)
            elif file_details == 'application/pdf':
                raw_text = read_pdf(doc_file)
                if st.checkbox('Display original text'):
                    st.write(raw_text)
            else:
                raw_text = docx2txt.process(doc_file)
                if st.checkbox('Display original text'):
                    st.write(raw_text)
        elif doc_file is None:
            raw_text = st.text_area('Enter your input')


        tasks_list = ["Emails"]
        task_option = st.sidebar.multiselect("Task", tasks_list, default="Emails")
        task_mapper = {"Emails": nfx.extract_emails(raw_text)}

        all_results = []
        for task in task_option:
            result = task_mapper[task]
            # st.write(result)
            all_results.append(result)
        st.write(all_results)

        with st.beta_expander("Results As DataFrame"):
            result_df = pd.DataFrame(all_results).T
            result_df.columns = task_option
            st.dataframe(result_df)
            #make_downloadable_df(result_df)

    elif choice=='Spelling correction':
        st.subheader('Spell checker and corrector')
        doc_file = st.file_uploader('Upload', type=['csv', 'pdf', 'text', 'docx'])
        if doc_file is not None:
            file_details = doc_file.type
            if file_details == 'text/plain':
                raw_text = str(doc_file.read(), 'utf-8')
                if st.checkbox('Display original text'):
                    st.write(raw_text)
            elif file_details == 'application/pdf':
                raw_text = read_pdf(doc_file)
                if st.checkbox('Display original text'):
                    st.write(raw_text)
            else:
                raw_text = docx2txt.process(doc_file)
                if st.checkbox('Display original text'):
                    st.write(raw_text)
        elif doc_file is None:
            raw_text = st.text_area('Enter your input')

        spell = SpellChecker()
        misspelled_word_list = raw_text.split()
        misspelled_word = spell.unknown(misspelled_word_list)
        b = spell.correction(raw_text)
        if st.button('Get corrected output'):
            st.write(b)
        if st.button('Analyze'):
            for word in misspelled_word:
                if word != spell.correction(word):
                    st.write('Original word:', word)
                    st.write('correct word:', spell.correction(word))
                    st.write('Suggested words:', spell.candidates(word))
                    #st.write('\n')





    elif choice=='Question and answering':
        st.subheader('Question and Answering system')

        doc_file=st.file_uploader('Upload',type=['csv','pdf','text','docx'])
        #


        if doc_file is not None:
            file_details=doc_file.type
            if file_details=='text/plain':
                raw_text=str(doc_file.read(),'utf-8')
                if st.checkbox('Display original text'):
                    st.write(raw_text)
            elif file_details=='application/pdf':
                raw_text=read_pdf(doc_file)
                if st.checkbox('Display original text'):
                    st.write(raw_text)
            else:
                raw_text=docx2txt.process(doc_file)
                if st.checkbox('Display original text'):
                    st.write(raw_text)
        elif doc_file is None:
            raw_text = st.text_area('Enter your input')

        st.subheader('Enter your question')
        question = st.text_area("What's on your mind?")


        # if st.button('Generate answer'):
        #
        #     qna=QnA(question,raw_text)
        #     st.write(qna)

    elif choice=='Text generation':
        pass

    else:
        st.header('About')
        st.write('''
        # This web application is built by *Arindam Mondal*, a Master's student in Data Analytics.''')
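The upload-handling branch (plain text vs. PDF vs. docx) is repeated in several choices above. A hedged refactoring sketch, assuming the same read_pdf helper and docx2txt import the app already relies on; load_raw_text is a hypothetical name:

import docx2txt

def load_raw_text(doc_file):
    # Return the text of an uploaded file using the same branching as the app above
    if doc_file.type == 'text/plain':
        return str(doc_file.read(), 'utf-8')
    elif doc_file.type == 'application/pdf':
        return read_pdf(doc_file)           # read_pdf is defined elsewhere in the app
    else:
        return docx2txt.process(doc_file)

Each `if doc_file is not None:` block could then collapse to raw_text = load_raw_text(doc_file).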
Example No. 6
def main():
	"""A Simple Summarization NLP App"""
	st.title("NLP App with Streamlit")
	menu = ['Home', 'NLP(files)','About']
	choice = st.sidebar.selectbox("Menu",menu)
	if choice == 'Home':
		st.subheader("Home: Analyse Text")
		raw_text = st.text_area("Enter Text Here")
		num_of_most_common = st.sidebar.number_input("Most Common Tokens",5,15)
		if st.button("Analyze"):
			
			with st.beta_expander("Original Text"):
				st.write(raw_text)
			
			with st.beta_expander("Text Analysis"):
				token_result_df = text_analyzer(raw_text)
				st.dataframe(token_result_df)
			
			with st.beta_expander("Entities"):
				# entity_result = get_entities(raw_text)
				# st.write(entity_result)

				entity_result = render_entities(raw_text)
				# st.write(entity_result)
				stc.html(entity_result, height=1000, scrolling=True)

			# Layouts
			col1, col2 = st.beta_columns(2)

			with col1:
				with st.beta_expander("Word Stats"):
					st.info("Word Statistics")
					docx = nt.TextFrame(raw_text)
					st.write(docx.word_stats())

				with st.beta_expander("Top Keywords"):
					st.info("Top Keywords/Tokens")
					processed_text = nfx.remove_stopwords(raw_text)
					keywords = get_most_common_tokens(processed_text, num_of_most_common)
					st.write(keywords)

				with st.beta_expander("Sentiment"):
					sent_result = get_sentiment(raw_text)
					st.write(sent_result)

			with col2:
				with st.beta_expander("Plot Word Freq"):
					fig = plt.figure()
					# sns.countplot(token_result_df['Token'])
					
					top_keywords = get_most_common_tokens(processed_text, num_of_most_common)
					plt.bar(keywords.keys(),top_keywords.values())
					plt.xticks(rotation=45)
					st.pyplot(fig)


				with st.beta_expander("Plot Part of Speech"):
					fig = plt.figure()
					sns.countplot(token_result_df['PoS'])
					plt.xticks(rotation=45)
					st.pyplot(fig)

				with st.beta_expander("Plot Wordcloud"):
					plot_wordcloud(raw_text)
				
			with st.beta_expander("Download Text Analysis Result"):
				make_downloadable(token_result_df)





	elif choice == 'NLP(files)':
		st.subheader("NLP Task")

		text_file = st.file_uploader("Upload Files",type=['pdf','docx','txt'])
		num_of_most_common = st.sidebar.number_input("Most Common Tokens",5,15)
		
		if text_file is not None:
			if text_file.type == 'application/pdf':
				raw_text = read_pdf(text_file)
				# st.write(raw_text)
			elif text_file.type == 'text/plain':
				# st.write(text_file.read()) # read as bytes
				raw_text = str(text_file.read(),"utf-8")
				# st.write(raw_text)
			else:
				raw_text = docx2txt.process(text_file)
				# st.write(raw_text)

			with st.beta_expander("Original Text"):
				st.write(raw_text)
			
			with st.beta_expander("Text Analysis"):
				token_result_df = text_analyzer(raw_text)
				st.dataframe(token_result_df)
			
			with st.beta_expander("Entities"):
				# entity_result = get_entities(raw_text)
				# st.write(entity_result)

				entity_result = render_entities(raw_text)
				# st.write(entity_result)
				stc.html(entity_result, height=1000, scrolling=True)

			# Layouts
			col1, col2 = st.beta_columns(2)

			with col1:
				with st.beta_expander("Word Stats"):
					st.info("Word Statistics")
					docx = nt.TextFrame(raw_text)
					st.write(docx.word_stats())

				with st.beta_expander("Top Keywords"):
					st.info("Top Keywords/Tokens")
					processed_text = nfx.remove_stopwords(raw_text)
					keywords = get_most_common_tokens(processed_text, num_of_most_common)
					st.write(keywords)

				with st.beta_expander("Sentiment"):
					sent_result = get_sentiment(raw_text)
					st.write(sent_result)

			with col2:
				with st.beta_expander("Plot Word Freq"):
					fig = plt.figure()
					# sns.countplot(token_result_df['Token'])
					
					top_keywords = get_most_common_tokens(processed_text, num_of_most_common)
					plt.bar(keywords.keys(),top_keywords.values())
					plt.xticks(rotation=45)
					st.pyplot(fig)


				with st.beta_expander("Plot Part of Speech"):
					try:	
						fig = plt.figure()
						sns.countplot(token_result_df['PoS'])
						plt.xticks(rotation=45)
						st.pyplot(fig)
					except:
						st.warning("Insufficient Data")

				with st.beta_expander("Plot Wordcloud"):
					plot_wordcloud(raw_text)
				
			with st.beta_expander("Download Text Analysis Result"):
				make_downloadable(token_result_df)




	else:
		st.subheader("About")
Example No. 7
def main():
    st.title("Text Analysis NLP _Beta v1.0")
    menu = ["Home", "Upload", "About"]

    choice = st.sidebar.selectbox("NLP Menu", menu)
    if choice == "Home":
        st.write(
            "Our day to day language can tell you an aboard patterns, insights and sentiments. Explore the power of Ai: Natural Language Processing algorithm and discover synchronicity that leads one to another. Free to use as much as you like! under GNU General Public License with a Motto #WeRiseByLiftingOthers"
        )
        st.write(
            "ML Analytics[@heroku streamlit-roy](https://streamlit-roy.herokuapp.com/)  Sample Dataset [@rupak-roy Github](https://github.com/rupak-roy/dataset-streamlit) "
        )
        st.write(
            "V3 update: Deep Learning module at [@share.streamlit.io] (https://share.streamlit.io/rupak-roy/streamlit_deeplearning_analytics/main/ML.py)"
        )

        raw_text = st.text_area("Enter Text Here")
        num_of_most_common = st.sidebar.number_input("Min Common Keywords", 5,
                                                     15)
        if st.button("Analyze"):

            #       with st.beta_expander("Original Text"):
            #          st.write(raw_text)
            with st.beta_expander("Text Analysis"):
                token_result_df = text_analyzer(raw_text)
                st.dataframe(token_result_df)

            with st.beta_expander("Entities Explorer"):
                # entity_result = get_entities(raw_text)
                # st.write(entity_result)

                entity_result = render_entities(raw_text)
                stc.html(entity_result, height=300, scrolling=True)

            with st.beta_expander("Summary using LexRank Approach"):
                st.text(
                    "Disclaimer: LexRank is an unsupervised approach to text summarization based on graph-based centrality scoring of sentences. The main idea is that sentences “recommend” other similar sentences to the reader. Thus, if one sentence is very similar to many others, it will likely be a sentence of great importance. The importance of this sentence also stems from the importance of the sentences “recommending” it. Thus, to get ranked highly and placed in a summary, a sentence must be similar to many sentences that are in turn also similar to many other sentences. This makes intuitive sense and allows the algorithms to be applied to any arbitrary new text."
                )
                my_summary = sumy_summarizer(raw_text)
                document_len = {
                    "Original": len(raw_text),
                    "Summary": len(my_summary)
                }
                st.write(document_len)
                st.write(my_summary)

                st.info(
                    "Rouge Score: F-Score:The Higher the Better the results, R-Recall/Sensitivty: refers correctly predicted positive observations to the all observations was actually positive, P - Precision talks about how precise/accurate your model is out of those predicted positive, how many of them are actual positive. Ranges from 0-1 The higher the score the better the model is."
                )
                eval_df = evaluate_summary(my_summary, raw_text)
                st.dataframe(eval_df.T)
                eval_df['metrics'] = eval_df.index
                c = alt.Chart(eval_df).mark_bar().encode(x='metrics',
                                                         y='rouge-1')
                st.altair_chart(c)

            with st.beta_expander("Summary using TextRank Approach"):
                st.text(
                    "Note: One of the famous Text Summarization algorithm gets its name from Larry Page, one of the co-founders of Google."
                )
                my_summary = summarize(raw_text)
                document_len = {
                    "Original": len(raw_text),
                    "Summary": len(my_summary)
                }
                st.write(document_len)
                st.write(my_summary)
                st.info(
                    "Rouge Score: F-Score:The Higher the Better the results, R-Recall/Sensitivty: refers correctly predicted positive observations to the all observations was actually positive, P - Precision talks about how precise/accurate your model is out of those predicted positive, how many of them are actual positive. Ranges from 0-1 The higher the score the better the model is."
                )
                eval_df = evaluate_summary(my_summary, raw_text)
                st.dataframe(eval_df)
                eval_df['metrics'] = eval_df.index
                c = alt.Chart(eval_df).mark_bar().encode(x='metrics',
                                                         y='rouge-1')
                st.altair_chart(c)

            # Layouts
            col1, col2 = st.beta_columns(2)

            with col1:

                with st.beta_expander("Word Statistics"):
                    st.info("Word Statistics")
                    docx = nt.TextFrame(raw_text)
                    st.write(docx.word_stats())

                with st.beta_expander("Top Keywords/Tokens"):
                    st.info("Top Keywords/Tokens")
                    processed_text = nfx.remove_stopwords(raw_text)
                    keywords = get_most_common_tokens(processed_text,
                                                      num_of_most_common)
                    st.write(keywords)

                with st.beta_expander("Sentiment Explorer"):
                    st.info("Sentiment Analysis")
                    sent_result = get_sentiment(raw_text)
                    st.write(sent_result)

            with col2:

                with st.beta_expander("Word Frequency Graph"):
                    fig = plt.figure()
                    top_keywords = get_most_common_tokens(
                        processed_text, num_of_most_common)
                    plt.bar(keywords.keys(), top_keywords.values())
                    plt.xticks(rotation=45)
                    st.pyplot(fig)

                with st.beta_expander("Part of Speech(PoS) Graph"):
                    try:
                        fig = plt.figure()
                        sns.countplot(token_result_df["PoS"])
                        plt.xticks(rotation=45)
                        st.pyplot(fig)
                    except:
                        st.warning("Error: Insufficient Data")

                with st.beta_expander("Plot Wordcloud"):
                    try:
                        plot_wordcloud(raw_text)
                    except:
                        st.warning("Error: Insufficient Data")

                with st.beta_expander("Stylography Explorer"):
                    st.info("using Mendelhall Curve")
                    plot_mendelhall_curve_2(raw_text)

            with st.beta_expander("Download The Analysis Report"):
                make_downloadable(token_result_df)

    elif choice == "Upload":
        st.write(
            "Our day to day language can tell you an aboard patterns, insights and sentiments. Explore the prower of Ai: Natural Language Processing algorithim and discover synchronicity that leads one to another. Free to use as much as you like! under GNU General Public License with a Motto #WeRiseByLiftingOthers"
        )
        st.write(
            "ML Analytics[@heroku streamlit-roy](https://streamlit-roy.herokuapp.com/)  Sample Dataset [@rupak-roy Github](https://github.com/rupak-roy/dataset-streamlit) "
        )
        st.write(
            "V3 update: Deep Learning module at [@share.streamlit.io] (https://share.streamlit.io/rupak-roy/streamlit_deeplearning_analytics/main/ML.py)"
        )

        text_file = st.file_uploader("Upload Files",
                                     type=["pdf", "docx", "txt"])
        num_of_most_common = st.sidebar.number_input("Min Common Keywords", 5,
                                                     15)

        if text_file is not None:
            if text_file.type == "application/pdf":
                raw_text = read_pdf(text_file)
                # st.write(raw_text)
            elif text_file.type == "text/plain":
                # st.write(text_file.read()) # read as bytes
                raw_text = str(text_file.read(), "utf-8")
                # st.write(raw_text)
            else:
                raw_text = docx2txt.process(text_file)
                # st.write(raw_text)

            with st.beta_expander("Original Text"):
                st.write(raw_text)

            with st.beta_expander("Text Analysis"):
                token_result_df = text_analyzer(raw_text)
                st.dataframe(token_result_df)

            with st.beta_expander("Entities Explorer"):
                # entity_result = get_entities(raw_text)
                # st.write(entity_result)

                entity_result = render_entities(raw_text)
                stc.html(entity_result, height=300, scrolling=True)

            with st.beta_expander("Summary using LexRank Approach"):
                st.text(
                    "Disclaimer: LexRank is an unsupervised approach to text summarization based on graph-based centrality scoring of sentences. The main idea is that sentences “recommend” other similar sentences to the reader. Thus, if one sentence is very similar to many others, it will likely be a sentence of great importance. The importance of this sentence also stems from the importance of the sentences “recommending” it. Thus, to get ranked highly and placed in a summary, a sentence must be similar to many sentences that are in turn also similar to many other sentences. This makes intuitive sense and allows the algorithms to be applied to any arbitrary new text."
                )
                my_summary = sumy_summarizer(raw_text)
                document_len = {
                    "Original": len(raw_text),
                    "Summary": len(my_summary)
                }
                st.write(document_len)
                st.write(my_summary)

                st.info(
                    "Rouge Score: F-Score:The Higher the Better the results, R-Recall/Sensitivty: refers correctly predicted positive observations to the all observations was actually positive, P - Precision talks about how precise/accurate your model is out of those predicted positive, how many of them are actual positive. Ranges from 0-1 The higher the score the better the model is."
                )
                eval_df = evaluate_summary(my_summary, raw_text)
                st.dataframe(eval_df.T)
                eval_df['metrics'] = eval_df.index
                c = alt.Chart(eval_df).mark_bar().encode(x='metrics',
                                                         y='rouge-1')
                st.altair_chart(c)

            with st.beta_expander("Summary using TextRank Approach"):
                st.text(
                    "Note: One of the famous Text Summarization algorithm gets its name from Larry Page, one of the co-founders of Google."
                )
                my_summary = summarize(raw_text)
                document_len = {
                    "Original": len(raw_text),
                    "Summary": len(my_summary)
                }
                st.write(document_len)
                st.write(my_summary)

                st.info(
                    "Rouge Score: F-Score:The Higher the Better the results, R-Recall/Sensitivty: refers correctly predicted positive observations to the all observations was actually positive, P - Precision talks about how precise/accurate your model is out of those predicted positive, how many of them are actual positive. Ranges from 0-1 The higher the score the better the model is."
                )
                eval_df = evaluate_summary(my_summary, raw_text)
                st.dataframe(eval_df)
                eval_df['metrics'] = eval_df.index
                c = alt.Chart(eval_df).mark_bar().encode(x='metrics',
                                                         y='rouge-1')
                st.altair_chart(c)

            # Layouts
            col1, col2 = st.beta_columns(2)

            with col1:
                with st.beta_expander("Word Statistics"):
                    st.info("Word Statistics")
                    docx = nt.TextFrame(raw_text)
                    st.write(docx.word_stats())

                with st.beta_expander("Top Keywords/Tokens"):
                    st.info("Top Keywords/Tokens")
                    processed_text = nfx.remove_stopwords(raw_text)
                    keywords = get_most_common_tokens(processed_text,
                                                      num_of_most_common)
                    st.write(keywords)

                with st.beta_expander("Sentiment Explorer"):
                    st.info("Sentiment Analysis")
                    sent_result = get_sentiment(raw_text)
                    st.write(sent_result)

            with col2:
                with st.beta_expander("Word Frequency Graph"):
                    fig = plt.figure()
                    top_keywords = get_most_common_tokens(
                        processed_text, num_of_most_common)
                    plt.bar(keywords.keys(), top_keywords.values())
                    plt.xticks(rotation=45)
                    st.pyplot(fig)

                with st.beta_expander("Part of Speech(Pos) Graph"):
                    try:

                        fig = plt.figure()
                        sns.countplot(token_result_df["PoS"])
                        plt.xticks(rotation=45)
                        st.pyplot(fig)
                    except:
                        st.warning("Error: Insufficient Data")

                with st.beta_expander("Plot Wordcloud"):
                    try:
                        plot_wordcloud(raw_text)
                    except:
                        st.warning("Error: Insufficient Data")

                with st.beta_expander("Stylography Explorer"):
                    st.info("using Mendelhall Curve")
                    plot_mendelhall_curve_2(raw_text)

            if st.sidebar.checkbox("Top Keywords/Tokens"):
                st.info("Top Keywords/Tokens")
                processed_text = nfx.remove_stopwords(raw_text)
                keywords = get_most_common_tokens(processed_text,
                                                  num_of_most_common)
                st.write(keywords)

            if st.sidebar.checkbox("Part of Speech(Pos) Graph"):
                fig = plt.figure()
                sns.countplot(token_result_df["PoS"])
                plt.xticks(rotation=45)
                st.pyplot(fig)

            if st.sidebar.checkbox("Sentiment Analysis"):
                st.info("Sentiment Analysis")
                sent_result = get_sentiment(raw_text)
                st.write(sent_result)

            if st.sidebar.checkbox("Stylography Analysis"):
                st.info("using Mendelhall Curve")
                plot_mendelhall_curve_2(raw_text)

            if st.sidebar.checkbox("Plot Word Frequency Graph"):
                fig = plt.figure()
                top_keywords = get_most_common_tokens(processed_text,
                                                      num_of_most_common)
                plt.bar(keywords.keys(), top_keywords.values())
                plt.xticks(rotation=45)
                st.pyplot(fig)

            if st.sidebar.checkbox("Plot WordCloud"):
                plot_wordcloud(raw_text)

            with st.beta_expander("Download The Analysis Report"):
                make_downloadable(token_result_df)

    else:
        st.subheader("About")
        st.text("Thank you for your time")

        st.markdown("""
Hi, I'm Bob, aka Rupak Roy. Things I write about frequently on Quora & LinkedIn: analytics for beginners, Data Science, Machine Learning, Deep Learning, Natural Language Processing (NLP), Computer Vision, Big Data technologies, Internet of Things and many other random topics of interest.
I formerly co-founded various AI-based projects to inspire and nurture the human spirit with AI training on how to leverage AI to solve problems for exponential growth.

My career contour spans various technologies, starting from a Master of Science in Information Technology to Commerce, with the privilege of being Wiley certified in various analytical domains. My alternative internet presences: Facebook, Blogger, LinkedIn, Medium, Instagram, ISSUU and Data2Dimensions.
If you wish to learn more about Data Science, follow me at:

~ Medium [@rupak.roy](https://medium.com/@rupak.roy)

~ Linkedin [@bobrupak](https://www.linkedin.com/in/bobrupak/)

My Fav. Quote:

Millions saw the apple fall but only Newton asked why! ~ “Curiosity is the spark of perfection and innovations. So connect with data and discover sync“
""")
        st.image('img/prism.gif')
        with st.beta_expander("Suprise!"):
            st.title(
                "COLLECT YOUR FULL VERSION MACHINE LEARNING APP @ ping_me #socialmedia"
            )
            st.image('img/office.jpg')
            st.info("")
            st.success("")
            st.warning("")
            st.error("")
Example No. 8
def main():
    """NLP App with Streamlit and TextBlob"""

    #st.title("NLP Simple Examples")

    title_templ = """
    <div style="background-color:blue;padding:8px;">
    <h1 style="color:cyan">NLP Simple Examples</h1>
    </div>
    """

    st.markdown(title_templ,unsafe_allow_html=True)

    subheader_templ = """
    <div style="background-color:cyan;padding:8px;">
    <h3 style="color:blue">Natural Language Processing On the Go...</h3>
    </div>
    """

    st.markdown(subheader_templ,unsafe_allow_html=True)

    st.sidebar.image("https://www.centreofexcellence.com/app/uploads/2016/09/nlp-diploma-course.jpg", use_column_width=True)

    activity = ["Text Analysis", "Translation", "Sentiment Analysis", "About"]
    choice = st.sidebar.selectbox("Menu",activity)



	# Text Analysis CHOICE
    if choice == 'Text Analysis':

        st.subheader("Text Analysis")        
        st.write("")
        st.write("")

        raw_text = st.text_area("Write something","Enter a Text in English...",height=250)

        if st.button("Analyze"):
            if len(raw_text) == 0:
            	st.warning("Enter a Text...")
            else:
            	blob = TextBlob(raw_text)
            	st.write("")

            	if blob.detect_language() != 'en':
            		st.warning("Enter a Text in English...")
            	else:
            		st.info("Basic Functions")
            		col1, col2 = st.beta_columns(2)

            		with col1:
            			with st.beta_expander("Basic Info"):
            				st.success("Text Stats")
            				word_desc = nt.TextFrame(raw_text).word_stats()
            				result_desc = {"Length of Text":word_desc['Length of Text'],
											"Num of Vowels":word_desc['Num of Vowels'],
											"Num of Consonants":word_desc['Num of Consonants'],
											"Num of Stopwords":word_desc['Num of Stopwords']}
            				st.write(result_desc)

            			with st.beta_expander("Stopwords"):
            				st.success("Stop Words List")
            				stop_w = nt.TextExtractor(raw_text).extract_stopwords()
            				st.error(stop_w)

            		with col2:
            			with st.beta_expander("Processed Text"):
            				st.success("Stopwords Excluded Text")
            				processed_text = str(nt.TextFrame(raw_text).remove_stopwords())
            				st.write(processed_text)

            			with st.beta_expander("Plot Wordcloud"):
            			    st.success("Wordcloud")
            			    plot_wordcloud(raw_text)



            		st.write("")
            		st.write("")
            		st.info("Advanced Features")
            		col3, col4 = st.beta_columns(2)

            		with col3:
            			with st.beta_expander("Tokens&Lemmas"):
            				st.write("T&L")
            				processed_text_mid = str(nt.TextFrame(raw_text).remove_stopwords())
            				processed_text_mid = str(nt.TextFrame(processed_text_mid).remove_puncts())
            				processed_text_fin = str(nt.TextFrame(processed_text_mid).remove_special_characters())
            				tandl = text_analyzer(processed_text_fin)
            				st.json(tandl)

            		with col4:
            			with st.beta_expander("Summarize"):
            				st.success("Summarize")
            				summary_text = summarize(raw_text,ratio=0.4)
            				if summary_text != "":
            					st.success(summary_text)
            				else:
            					st.warning("Please insert a Longer Text")


        


    # Translation CHOICE
    elif choice == 'Translation':

        st.subheader("Text Translation")

        st.write("")
        st.write("")
        raw_text = st.text_area("","Write something to be translated...")
        if len(raw_text) < 3:
            st.warning("Please provide a string with at least 3 characters...")
        else:
            blob = TextBlob(raw_text)
            lang = blob.detect_language()
            #st.write(lang)
            tran_options = st.selectbox("Select translation language",['Chinese', 'English', 'German', 'Italian', 'Russian', 'Spanish'])
            if st.button("Translate"):
                if tran_options == 'Italian' and lang != 'it':
                    st.text("Translating to Italian...")
                    tran_result = blob.translate(from_lang=lang, to='it')
                elif tran_options == 'Spanish' and lang != 'es':
                    st.text("Translating to Spanish...")
                    tran_result = blob.translate(from_lang=lang, to='es')
                elif tran_options == 'Chinese' and lang != 'zh-CN':
                    st.text("Translating to Chinese...")
                    tran_result = blob.translate(from_lang=lang, to='zh-CN')
                elif tran_options == 'Russian' and lang != 'ru':
                    st.text("Translating to Russian...")
                    tran_result = blob.translate(from_lang=lang, to='ru')
                elif tran_options == 'German' and lang != 'de':
                    st.text("Translating to German...")
                    tran_result = blob.translate(from_lang=lang, to='de')
                elif tran_options == 'English' and lang != 'en':
                    st.text("Translating to English...")
                    tran_result = blob.translate(from_lang=lang, to='en')
                else:
                    tran_result = "Text is already in " + "'" + lang + "'"


                st.success(tran_result)
            
        
    

    # Sentiment Analysis CHOICE
    elif choice == 'Sentiment Analysis':
        
        st.subheader("Sentiment Analysis")

        st.write("")
        st.write("")

        raw_text = st.text_area("", "Enter a Text...")

        if st.button("Evaluate"):
            if len(raw_text) == 0:
                st.warning("Enter a Text...")
            else:
                blob = TextBlob(raw_text)
                lang = blob.detect_language()

                if lang != 'en':
                    tran_result = blob.translate(from_lang=lang, to='en')
                    blob = TextBlob(str(tran_result))

                result_sentiment = blob.sentiment
                st.info("Sentiment Polarity: {}".format(result_sentiment.polarity))
                st.info("Sentiment Subjectivity: {}".format(result_sentiment.subjectivity))

        



    # About CHOICE
    else:# choice == 'About':
        st.subheader("About")

        st.write("")
        st.write("")

        st.markdown("""
        ### NLP Simple Examples (App with Streamlit and TextBlob)
        
        ##### By
        + **[Rosario Moscato LAB](https://www.youtube.com/channel/UCDn-FahQNJQOekLrOcR7-7Q)**
        + [[email protected]](mailto:[email protected])
        """)
Example No. 9
def main():

	# st.title("Data Science Application Dashboard")

	menu = ['Home', 'NLP Summarization App', 'NLP Text Analysis', 'NLP Q&A App']
	choice = st.sidebar.selectbox("Menu", menu)

	# Home Page
	if choice == 'Home':
		
		st.markdown("""
						<div align='center'><h1 style="color:blue"><font size="5"> Welcome to Data Science Application Dashboard </font> </h1></div> <br>\
						<p> The purpose of this dashboard is to demonstrate a few applications of natural language processing (NLP). NLP is a branch of artificial intelligence \
						that deals with the interaction between computers and humans using language. The goal of NLP is to read, understand, translate, and make sense of \
						human languages in a manner that is valuable. </p>

						<p> Three applications are demonstrated in the dashboard. Brief descriptions are given below: </p>

						<ol>
							<li> NLP Summarization App - This app will allow you to either enter an article from a URL, copy and paste text, or upload a file (.pdf, .docx, .txt). The app will apply \
							the TextRank algorithm to summarize the article and also evaluate the summary using Rouge scores. </li>
							<li> NLP Text Analysis App - This app will allow you to either enter an article from a URL, copy and paste text, or upload a file (.pdf, .docx, .txt). The app will apply \
							a variety of Python packages (e.g., spaCy, neattext, wordcloud, etc.) to analyze the article and generate word statistics, top keywords, sentiment, a wordcloud, and more. </li>
							<li> NLP Q&A App - This app will allow you to either enter an article from a URL or copy and paste text. The app will read the article, and you can ask any questions \
							related to the article. The app will apply cosine similarity to respond with a few sentences that are closely related to your question. </li>
						</ol> 


						 """, unsafe_allow_html=True)



	# NLP Q&A Page
	elif choice == 'NLP Q&A App':

		st.subheader("Natural Language Processing (NLP) Q&A Application")
		file_format = ['URL','Plain Text']
		files = st.sidebar.selectbox("Input Format", file_format)

		if files == 'URL':

			text_input = st.text_area("Please Enter an URL Article")
			if text_input:

				try:

					#print(text_input)

					# Get Article
					article = Article(text_input.strip())
					article.download()
					article.parse()
					article.nlp()
					corpus  = article.text

					#print(corpus)
					#print(text)

					# Tokenization
					text = corpus
					sentence_list = nltk.sent_tokenize(text)

					# print(text)

					question = st.text_area("Please Enter a question related to the Artcile")

					if question:

						response = bot_response(user_input = question, sentence_list = sentence_list)
						st.subheader(response)

				except:

					st.warning("Please Enter a correct URL with Article")

		elif files == 'Plain Text':

			text_input = st.text_area("Please Enter Text Here")
			if text_input:

				try:

					question = st.text_area("Please Enter a question related to the Artcile")

					sentence_list = nltk.sent_tokenize(text_input)

					# print(sentence_list)

					if question:

						response = bot_response(user_input = question, sentence_list = sentence_list)
						st.subheader(response)

				except:

					st.warning("Please Enter some text")

	# NLP Summarization page
	elif choice == 'NLP Summarization App':

		st.subheader("Natural Language Processing (NLP) Summarization Application")
		file_format = ['URL','Plain Text', 'Upload a File']
		files = st.sidebar.selectbox("Input Format", file_format)

		if files == 'URL':

			text_input = st.text_area("Please Enter an URL Article")
			if st.button("Summarize"):

				try:
					# Get Article
					article = Article(text_input.strip())
					article.download()
					article.parse()
					article.nlp()
					corpus  = article.text

					#print(corpus)
					#print(text)

					# Tokenization
					text = corpus
					sentence_list = nltk.sent_tokenize(text)

					art_text = " ".join(sentence_list)

					with st.beta_expander("Original Text"):
						st.write(art_text)

					#c1, c2 = st.beta_columns(2)

					#with c1:
					#	with st.beta_expander("LexRank Summary"):
					#		pass

					with st.beta_expander("TextRank Summary"):
						textrank_sum = summarize(art_text)
						doc_length = {"Article Word Count": len(art_text), "Summary Word Count": len(textrank_sum)}
						st.write(doc_length)
						st.write(textrank_sum)


						st.info("Rouge Score")
						score = evaluate_summary(textrank_sum, art_text)
						st.dataframe(score)

				except:
					st.warning("Please Enter a correct URL with Article")

		elif files == 'Plain Text':

			text_input = st.text_area("Please Enter Text Here")
			if st.button("Summarize"):

				try:

					sentence_list = nltk.sent_tokenize(text_input)
					art_text = " ".join(sentence_list)

					with st.beta_expander("Original Text"):
						st.write(art_text)

					with st.beta_expander("TextRank Summary"):
						textrank_sum = summarize(art_text)
						doc_length = {"Article Word Count": len(art_text.split()), "Summary Word Count": len(textrank_sum.split())}
						st.write(doc_length)
						st.write(textrank_sum)


						st.info("Rouge Score")
						score = evaluate_summary(textrank_sum, art_text)
						st.dataframe(score)

				except:
					st.warning("Please Enter more sentences")

		elif files == 'Upload a File':

			text_file = st.file_uploader("Please Upload a File", type = ['pdf', 'docx', 'txt'])

			if text_file is not None:
				if text_file.type == 'application/pdf':
					text_input = read_pdf(text_file)

				elif text_file.type == 'text/plain':
					text_input = str(text_file.read(), 'utf-8')

				else:
					text_input = docx2txt.process(text_file)

				try:

					with st.beta_expander("Original Text"):
						st.write(text_input)

					with st.beta_expander("TextRank Summary"):
						textrank_sum = summarize(text_input)
						doc_length = {"Article Word Count": len(text_input.split()), "Summary Word Count": len(textrank_sum.split())}
						st.write(doc_length)
						st.write(textrank_sum)


						st.info("Rouge Score")
						score = evaluate_summary(textrank_sum, text_input)
						st.dataframe(score)

				except:
					st.warning("Please Enter more sentences")

	# NLP Text Analysis
	elif choice == 'NLP Text Analysis':

		st.subheader("Natural Language Processing (NLP) Text Analysis Application")
		file_format = ['URL','Plain Text', 'Upload a File']
		files = st.sidebar.selectbox("Input Format", file_format)

		if files == 'URL':

			text_input = st.text_area("Please Enter an URL Article")
			num_of_tokens = st.sidebar.number_input("Most Common Word", 5, 15)
			if st.button("Analyze"):

				try:
					# Get Article
					article = Article(text_input.strip())
					article.download()
					article.parse()
					article.nlp()
					corpus  = article.text

					# Tokenization
					text = corpus
					sentence_list = nltk.sent_tokenize(text)

					art_text = " ".join(sentence_list)

					# Original Text
					with st.beta_expander("Original Text"):
						st.write(art_text)

					# Text Analysis
					with st.beta_expander("Text Analysis"):
						token_df = text_analysis(art_text)
						st.dataframe(token_df)

					# Entities
					with st.beta_expander("Entities"):
						#ent_check = get_entities(art_text)
						#st.write(ent_check)

						entity_df = render_entities(art_text)
						stc.html(entity_df, height=500, scrolling=True)

					c1, c2 = st.beta_columns(2)

					
					with c1:
						# Word Statistics
						with st.beta_expander("Word Statistics"):
							st.info("Word Statistics")
							docx = nt.TextFrame(art_text)
							st.write(docx.word_stats())

						# Plot Part of Speech
						with st.beta_expander("Plot Part of Speech"):
							fig = plt.figure()
							sns.countplot(token_df['PoS'])
							plt.xticks(rotation=45)
							st.pyplot(fig)

						# Get Sentiment
						with st.beta_expander("Sentiment"):
							sent_result = get_sentiment(art_text)
							st.write(sent_result)
					
					with c2:
						# Most Common Word
						with st.beta_expander("Top Keywords"):
							st.info("Top Keywords/Tokens")
							lower_text = art_text.lower()
							remove_sw = nfx.remove_stopwords(lower_text)
							keyword = most_word(remove_sw, num_of_tokens)
							st.write(keyword)

						# Plot Word Freq
						with st.beta_expander("Plot Top Word Frequency"):
							fig = plt.figure()
							top_word = most_word(remove_sw, num_of_tokens)
							plt.bar(top_word.keys(), top_word.values())
							plt.xticks(rotation=45)
							st.pyplot(fig)

						# Generate WordCloud
						with st.beta_expander("Plot WordCloud"):
							lower_text = art_text.lower()
							remove_sw = nfx.remove_stopwords(lower_text)
							plot_wordcloud(remove_sw)

					#with st.beta_expander("Download Text Analysis Results"):
					#	make_downloadable(token_df)

				except:
					st.warning("Please Enter a correct URL with Article")

		elif files == 'Plain Text':

			text_input = st.text_area("Please Enter Text Here")
			num_of_tokens = st.sidebar.number_input("Most Common Word", 5, 15)
			if st.button("Analyze"):

				try:
					
					sentence_list = nltk.sent_tokenize(text_input)
					art_text = " ".join(sentence_list)

					# Original Text
					with st.beta_expander("Original Text"):
						st.write(art_text)

					# Text Analysis
					with st.beta_expander("Text Analysis"):
						token_df = text_analysis(art_text)
						st.dataframe(token_df)

					# Entities
					with st.beta_expander("Entities"):
						#ent_check = get_entities(art_text)
						#st.write(ent_check)

						entity_df = render_entities(art_text)
						stc.html(entity_df, height=500, scrolling=True)

					c1, c2 = st.beta_columns(2)

					
					with c1:
						# Word Statistics
						with st.beta_expander("Word Statistics"):
							st.info("Word Statistics")
							docx = nt.TextFrame(art_text)
							st.write(docx.word_stats())

						# Plot Part of Speech
						with st.beta_expander("Plot Part of Speech"):
							fig = plt.figure()
							sns.countplot(token_df['PoS'])
							plt.xticks(rotation=45)
							st.pyplot(fig)

						# Get Sentiment
						with st.beta_expander("Sentiment"):
							sent_result = get_sentiment(art_text)
							st.write(sent_result)
					
					with c2:
						# Most Common Word
						with st.beta_expander("Top Keywords"):
							st.info("Top Keywords/Tokens")
							lower_text = art_text.lower()
							remove_sw = nfx.remove_stopwords(lower_text)
							keyword = most_word(remove_sw, num_of_tokens)
							st.write(keyword)

						# Plot Word Freq
						with st.beta_expander("Plot Top Word Frequency"):
							fig = plt.figure()
							top_word = most_word(remove_sw, num_of_tokens)
							plt.bar(top_word.keys(), top_word.values())
							plt.xticks(rotation=45)
							st.pyplot(fig)

						# Generate WordCloud
						with st.beta_expander("Plot WordCloud"):
							lower_text = art_text.lower()
							remove_sw = nfx.remove_stopwords(lower_text)
							plot_wordcloud(remove_sw)

					#with st.beta_expander("Download Text Analysis Results"):
					#	make_downloadable(token_df)

				except:
					st.warning("Please Enter more sentences")

		elif files == 'Upload a File':
			
			text_file = st.file_uploader("Please Upload a File", type = ['pdf', 'docx', 'txt'])
			num_of_tokens = st.sidebar.number_input("Most Common Word", 5, 15)
			if text_file is not None:
				if text_file.type == 'application/pdf':
					text_input = read_pdf(text_file)

				elif text_file.type == 'text/plain':
					text_input = str(text_file.read(), 'utf-8')

				else:
					text_input = docx2txt.process(text_file)

				try:

					# Original Text
					with st.beta_expander("Original Text"):
						st.write(text_input)

					# Text Analysis
					with st.beta_expander("Text Analysis"):
						token_df = text_analysis(text_input)
						st.dataframe(token_df)

					# Entities
					with st.beta_expander("Entities"):
						#ent_check = get_entities(art_text)
						#st.write(ent_check)

						entity_df = render_entities(text_input)
						stc.html(entity_df, height=500, scrolling=True)

					c1, c2 = st.beta_columns(2)

					
					with c1:
						# Word Statistics
						with st.beta_expander("Word Statistics"):
							st.info("Word Statistics")
							docx = nt.TextFrame(text_input)
							st.write(docx.word_stats())

						# Plot Part of Speech
						with st.beta_expander("Plot Part of Speech"):
							fig = plt.figure()
							sns.countplot(token_df['PoS'])
							plt.xticks(rotation=45)
							st.pyplot(fig)

						# Get Sentiment
						with st.beta_expander("Sentiment"):
							sent_result = get_sentiment(text_input)
							st.write(sent_result)
					
					with c2:
						# Most Common Word
						with st.beta_expander("Top Keywords"):
							st.info("Top Keywords/Tokens")
							lower_text = text_input.lower()
							remove_sw = nfx.remove_stopwords(lower_text)
							keyword = most_word(remove_sw, num_of_tokens)
							st.write(keyword)

						# Plot Word Freq
						with st.beta_expander("Plot Top Word Frequency"):
							fig = plt.figure()
							top_word = most_word(remove_sw, num_of_tokens)
							plt.bar(top_word.keys(), top_word.values())
							plt.xticks(rotation=45)
							st.pyplot(fig)

						# Generate WordCloud
						with st.beta_expander("Plot WordCloud"):
							lower_text = text_input.lower()
							remove_sw = nfx.remove_stopwords(lower_text)
							plot_wordcloud(remove_sw)

				except:
					st.warning("The uploaded file does not have enough text")
Ejemplo n.º 10
0
def page_data():
    st.markdown("### Analyse du Dataset {}".format(current_dataset_name))
    st.markdown("#### Distribution")
    emotion_names = current_dataset['emotion'].value_counts().index.tolist()
    fig, ax = plt.subplots(figsize=(12, 12))
    sns.countplot(x='emotion',
                  data=current_dataset,
                  order=current_dataset.emotion.value_counts().index)
    plt.xticks(rotation=45)
    st.pyplot(fig)
    st.dataframe(pd.DataFrame(current_dataset.emotion.value_counts()).T)
    st.write(
        "Some emotions do not have enough data, which may reduce the relevance of our model."
    )

    # dxp.count('emotion', data=current_dataset, split='sentiment', normalize='emotion')
    g = sns.catplot(data=current_dataset,
                    x='emotion',
                    hue='sentiment',
                    kind='count',
                    height=7,
                    aspect=1.5)
    plt.xticks(rotation=45)
    st.pyplot(g.fig)
    st.write(
        "The detected sentiments do not always match the associated emotions. Notably, negative emotions such as anger, sadness, or worry are perceived as positive, negative, or neutral alike."
    )

    st.write("Corpus Unclean")
    corpus = current_dataset.content.tolist()
    corpus = ' '.join(corpus)
    docx = nt.TextFrame(corpus)
    docx.text = corpus
    # st.write(docx.describe())
    st.write("Noise Scan")
    st.write(docx.noise_scan())
    st.write("Lexical Richness")
    st.write(docx.lexical_richness())

    st.write("Corpus Clean")
    corpus = current_dataset.clean_content.tolist()
    corpus = ' '.join(corpus)
    docx = nt.TextFrame(corpus)
    docx.text = corpus
    # st.write(docx.describe())
    st.write("Noise Scan")
    st.write(docx.noise_scan())
    st.write("Lexical Richness")
    st.write(docx.lexical_richness())

    NUM_TOP_WORDS = 20
    top_20_before = hero.visualization.top_words(
        current_dataset['content']).head(NUM_TOP_WORDS)
    top_20_after = hero.visualization.top_words(
        current_dataset['clean_content']).head(NUM_TOP_WORDS)

    st.write(current_dataset.head(10))

    fig, ax = plt.subplots(figsize=(12, 12))
    top_20_before.plot.bar(rot=90)
    ax.set_title('Top 20 words before cleaning')
    st.pyplot(fig)

    fig, ax = plt.subplots(figsize=(12, 12))
    top_20_after.plot.bar(rot=90)
    ax.set_title('Top 20 words after cleaning')
    st.pyplot(fig)

    st.markdown("#### Nuage de mots")

    for emotion in emotion_names:
        wordcloud_generator(current_dataset.query(
            "emotion == '{}'".format(emotion)).clean_content,
                            title=emotion)
        corpus = current_dataset.query(
            "emotion == '{}'".format(emotion)).clean_content.tolist()
        corpus = ' '.join(corpus)
        keywords = extract_keywords(corpus)
        plot_keywords(keywords)
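# --------------------------------------------------------------------------
# Hedged sketch (assumed): wordcloud_generator, extract_keywords and
# plot_keywords are used in page_data above but defined elsewhere in the app.
# Plausible stand-ins using the wordcloud package and collections.Counter;
# figure sizes and the num_keywords default are illustrative assumptions.
from collections import Counter

import matplotlib.pyplot as plt
import streamlit as st
from wordcloud import WordCloud


def wordcloud_generator(text_series, title=""):
    """Render a word cloud for a pandas Series of cleaned texts."""
    text = " ".join(text_series.astype(str).tolist())
    wc = WordCloud(width=800, height=400, background_color="white").generate(text)
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.imshow(wc, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(title)
    st.pyplot(fig)


def extract_keywords(corpus, num_keywords=20):
    """Most frequent tokens in the corpus as (word, count) pairs."""
    return Counter(corpus.split()).most_common(num_keywords)


def plot_keywords(keywords):
    """Bar chart of the (word, count) pairs returned by extract_keywords."""
    words, counts = zip(*keywords)
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(words, counts)
    plt.xticks(rotation=45)
    st.pyplot(fig)
# --------------------------------------------------------------------------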
Ejemplo n.º 11
0
def main():
    """Text Analysis with Streamlit and NLP"""
    st.title("Transcript Analyser with Streamlit and NLP")

    title_templ = """
    <div style="padding:8px;">
    </div>
    """
    st.markdown(title_templ, unsafe_allow_html=True)

    activity = ["Text Analysis", "About"]
    choice = st.sidebar.radio("Menu", activity)

    if choice == "Text Analysis":
        st.subheader("Text Analysis")
        st.write("")
        st.write("")

        raw_text = st.text_area("Enter your plain text here", height=300)
        customText = [
            'agent', 'customer', 'cosmo', 'agent-freehand', 'freehand', '/',
            '::', 'british', 'british gas', 'chat', 'engagement'
        ]
        final_processed_text = ''

        if st.button("Analyze"):
            blob = TextBlob(raw_text)
            st.write("")

            st.info("Basic Analysis")
            col1, col2 = st.beta_columns(2)

            with col1:
                with st.beta_expander("Basic Info"):
                    st.success("Text Stats")
                    word_desc = nt.TextFrame(raw_text).word_stats()
                    result_desc = {
                        "Length of Text": word_desc['Length of Text'],
                        "Num of Vowels": word_desc['Num of Vowels'],
                        "Num of Consonants": word_desc['Num of Consonants'],
                        "Num of Stopwords": word_desc['Num of Stopwords']
                    }
                    st.write(result_desc)

                with st.beta_expander("Stopwords"):
                    st.success("Stop Words List")
                    stop_w = nt.TextExtractor(raw_text).extract_stopwords()
                    st.error(stop_w)

            with col2:
                with st.beta_expander("Processed text"):
                    st.success("Processed text")
                    processed_text = raw_text.lower()
                    processed_text = str(
                        nt.TextFrame(processed_text).remove_stopwords())
                    processed_text = nfx.clean_text(processed_text,
                                                    urls=True,
                                                    numbers=True,
                                                    puncts=True,
                                                    stopwords=True,
                                                    emails=True,
                                                    phone_num=True,
                                                    non_ascii=True,
                                                    multiple_whitespaces=True,
                                                    contractions=True)
                    processed_text = nfx.remove_shortwords(processed_text)
                    for ct in customText:
                        processed_text = nfx.clean_text(processed_text,
                                                        custom_pattern=ct)
                    final_processed_text = processed_text
                    st.write(final_processed_text)

            st.write("")
            st.write("")
            st.info("Advanced Analysis")
            col3, col4 = st.beta_columns(2)

            with col3:
                with st.beta_expander("WordCloud"):
                    st.success("WordCloud")
                    plot_wordcloud(final_processed_text)

            with col4:
                with st.beta_expander("Tokens&Lemmas"):
                    st.write("T&L")
                    processed_text_mid = str(
                        nt.TextFrame(raw_text).remove_stopwords())
                    processed_text_mid = str(
                        nt.TextFrame(processed_text_mid).remove_puncts())
                    processed_text_fin = str(
                        nt.TextFrame(
                            processed_text_mid).remove_special_characters())
                    tandl = text_analyzer(processed_text_fin)
                    st.json(tandl)

    elif choice == "About":
        st.subheader("About")

        st.write("")
        st.write("")

        st.markdown("""
        ### By
        + *Abhay Chaskar*
        + *Shiva Saran*
        + *Shraddha Gadekar*
        + *Balaleshwar Raut*
        """)
sns.countplot(df['php'])

df['php'].value_counts()

df['php'].value_counts().plot(kind='bar')
"""### Text Preprocessing
+ neattext : remove_stopwords
+ pip install neattext
"""
#pip install neattext

import neattext as nt
import neattext.functions as nfx

# Explore For Noise
df['title'].apply(lambda x: nt.TextFrame(x).noise_scan())

# Explore For Noise
df['title'].apply(lambda x: nt.TextExtractor(x).extract_stopwords())

dir(nfx)

# Explore For Noise
df['title'].apply(nfx.remove_stopwords)

corpus = df['title'].apply(nfx.remove_stopwords)
"""### Feature Engineering
+ Build features from our text
+ TFIDF,countvectorizer,bow
"""
Ejemplo n.º 13
0
df = df[['tweet', 'class']]
#print(df.head())

#print(df.iloc[4]['tweet'])

# Remove special characters, hashtags, stopwords/punctuation
# Methods/attrib
#print(dir(nt))
#print(dir(nfx))

s = df.iloc[4]['tweet']
#print(s)

# Method 1: OOP using TextFrame
docx = nt.TextFrame(s)

docx.describe()
#print(docx.head(10))

# Remove stopwords
#print(docx.remove_stopwords().text)
# Remove punctuation
#print(docx.remove_puncts().text)
#print(docx.remove_puncts(most_common=False).text)

# Method 2: Using Functional Approach
#print(s)

# Remove userhandles, hashtags, special characters
#print(nfx.remove_userhandles(s))