def test_issue3531():
    """Test that displaCy renderer doesn't require "settings" key."""
    example_dep = {
        "words": [
            {"text": "But", "tag": "CCONJ"},
            {"text": "Google", "tag": "PROPN"},
            {"text": "is", "tag": "VERB"},
            {"text": "starting", "tag": "VERB"},
            {"text": "from", "tag": "ADP"},
            {"text": "behind.", "tag": "ADV"},
        ],
        "arcs": [
            {"start": 0, "end": 3, "label": "cc", "dir": "left"},
            {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
            {"start": 2, "end": 3, "label": "aux", "dir": "left"},
            {"start": 3, "end": 4, "label": "prep", "dir": "right"},
            {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
        ],
    }
    example_ent = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    }
    dep_html = displacy.render(example_dep, style="dep", manual=True)
    assert dep_html
    ent_html = displacy.render(example_ent, style="ent", manual=True)
    assert ent_html
def test_issue3288(en_vocab):
    """Test that retokenization works correctly via displaCy when punctuation
    is merged onto the preceding token and the tensor is resized."""
    words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
    heads = [1, 0, -1, 1, 0, 1, -2, -3]
    deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    doc.tensor = numpy.zeros((len(words), 96), dtype="float32")
    displacy.render(doc)
def test_displacy_rtl():
    # Source: http://www.sobhe.ir/hazm/ – is this correct?
    words = ["ما", "بسیار", "کتاب", "می\u200cخوانیم"]
    # These are (likely) wrong, but it's just for testing
    pos = ["PRO", "ADV", "N_PL", "V_SUB"]  # needs to match lang.fa.tag_map
    deps = ["foo", "bar", "foo", "baz"]
    heads = [1, 0, 1, -2]
    nlp = Persian()
    doc = get_doc(nlp.vocab, words=words, pos=pos, tags=pos, heads=heads, deps=deps)
    doc.ents = [Span(doc, 1, 3, label="TEST")]
    html = displacy.render(doc, page=True, style="dep")
    assert "direction: rtl" in html
    assert 'direction="rtl"' in html
    assert 'lang="{}"'.format(nlp.lang) in html
    html = displacy.render(doc, page=True, style="ent")
    assert "direction: rtl" in html
    assert 'lang="{}"'.format(nlp.lang) in html
def test_issue2361(de_tokenizer):
    chars = ("&lt;", "&gt;", "&amp;", "&quot;")
    doc = de_tokenizer('< > & " ')
    doc.is_parsed = True
    doc.is_tagged = True
    html = render(doc)
    for char in chars:
        assert char in html
def test_displacy_render_wrapper(en_vocab):
    """Test that displaCy accepts a custom rendering wrapper."""

    def wrapper(html):
        return "TEST" + html + "TEST"

    displacy.set_render_wrapper(wrapper)
    doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
    html = displacy.render(doc, style="ent")
    assert html.startswith("TEST<div")
    assert html.endswith("/div>TEST")
    # Restore
    displacy.set_render_wrapper(lambda html: html)
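As a usage sketch beyond the test (the wrapper body below is illustrative, not part of the test suite), the same hook can wrap displaCy output in a styled container:

from spacy import displacy

def scrollable_wrapper(html):
    # Illustrative wrapper: make wide dependency parses horizontally scrollable.
    return ('<div style="max-width: 100%; overflow-x: auto; padding: 0.5rem;">'
            + html + '</div>')

displacy.set_render_wrapper(scrollable_wrapper)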
def to_html(doc, output='/tmp', style='dep'):
    """Doc method extension for saving the current state as a displaCy
    visualization.
    """
    # generate filename from first six non-punct tokens
    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
    html = displacy.render(doc, style=style, page=True)  # render markup
    if output is not None:
        output_path = Path(output)
        if not output_path.exists():
            output_path.mkdir()
        output_file = Path(output) / file_name
        output_file.open('w', encoding='utf-8').write(html)  # save to file
        print('Saved HTML to {}'.format(output_file))
    else:
        print(html)
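A possible way to wire this helper up as a Doc extension attribute (a sketch assuming spaCy v2+ and the en_core_web_sm model; the extension name and example sentence are illustrative):

import spacy
from spacy.tokens import Doc

# Register the helper above as a method extension, callable via doc._.to_html(...)
Doc.set_extension("to_html", method=to_html, force=True)

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence about displaCy.")
doc._.to_html(output="/tmp", style="dep")  # writes /tmp/This-is-a-sentence-about-displaCy.html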
def file_handler(image_id, part_table, part_slot, encoded_filepath, view_type):
    """Display page for a file system element.

    If the element is a directory, the page displays the directory listing as
    read from the disk image. If a file is selected, the file's contents are
    sent in the Response as a binary payload.
    """
    file_path = urllib.unquote(encoded_filepath)
    partition = _found_or_404(Partition.by_image_table_and_slot(image_id, part_table, part_slot))
    fs_ele = _found_or_404(FileSysEle.from_partition(partition, file_path))
    # Check if we have a directory
    if fs_ele.is_directory:
        # Render the dir listing template
        return _render_directory(partition, file_path)
    # It's a file; we'll need a temp file to analyse or serve
    temp_file = FileSysEle.create_temp_copy(partition, fs_ele)
    # Get the byte stream object and index it.
    byte_sequence, full_text = ImageIndexer.get_path_details(
        temp_file, os.path.abspath(fs_ele.path))
    # Build the NLP object from the extracted full_text, generate entity markup
    full_text_nlp_obj = nlp(unicode(full_text, 'utf-8'))
    full_text_entity_html = displacy.render(full_text_nlp_obj, style='ent', page=False)
    # Check whether this path has been indexed and the results are in the DB
    file_element = FileElement.by_partition_and_path(partition, file_path)
    if file_element is None:
        # If not then add the path and p
        file_element = FileElement(file_path, partition, byte_sequence)
    # Is this a blob request
    if request_wants_binary():
        return send_file(temp_file, mimetype=byte_sequence.mime_type,
                         as_attachment=True, attachment_filename=fs_ele.name)
    # Return correct view depending on URL parameter
    if view_type == 'text-view':
        return render_template('text_analysis.html', image=partition.image,
                               partition=partition, file_path=file_path,
                               fs_ele=fs_ele, file_element=file_element,
                               full_text=full_text)
    else:
        return render_template('entity_analysis.html', image=partition.image,
                               partition=partition, file_path=file_path,
                               fs_ele=fs_ele, file_element=file_element,
                               full_text=full_text_entity_html)
def test_displacy_raises_for_wrong_type(en_vocab):
    with pytest.raises(ValueError):
        displacy.render("hello world")
        d = dict(start=m.start(), end=m.end(), label="")
        ents.append(d)

    # sort the result by ents, as the ent rule suggests
    sort_ents = sorted(ents, key=lambda x: x["start"])

    st.header('Output')
    result_view = st.radio("Choose visualization type",
                           ('Highlighting', 'Word cloud', 'Table'), index=0)
    if result_view == 'Highlighting':
        # use spaCy to highlight the keywords
        ex = [{"text": text, "ents": sort_ents, "title": None}]
        html = displacy.render(ex, style="ent", manual=True)
        html = html.replace("\n", " ")
        st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
    elif result_view == "Table":
        # tabular data (columns: keywords, score)
        df = pd.DataFrame(keywords, columns=("keywords", "score"))
        st.table(df)
    else:
        # create and generate a word cloud image
        wordcloud = WordCloud(width=1000, height=600, max_font_size=80,
                              min_font_size=10, prefer_horizontal=1,
                              max_words=numOfKeywords,
import spacy
from spacy import displacy

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")

# Process whole documents
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)
displacy.render(doc, style='dep')
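Outside a Jupyter notebook, displacy.render returns the markup as a string rather than displaying it inline; a minimal sketch that reuses the doc from the snippet above and saves the dependency visualization (the output filename is just an example):

svg = displacy.render(doc, style="dep", jupyter=False)
with open("thrun_dep.svg", "w", encoding="utf-8") as f:
    f.write(svg)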
def main(): '''Creates a main title and subheader on your page - these are static across all pages''' st.title("Tweet Classifier") st.subheader("Climate change tweet classification") # Creating sidebar with selection box - # you can create multiple pages this way options = ["Prediction", "Natural Language Processing Tool", "Exploratory Data Analysis"] selection = st.sidebar.selectbox("Choose Option", options) ##### Building out the Prediction page #### if selection == "Prediction": st.markdown("# Machine Learning Model Predictions") st.markdown('Sentiment analysis is the classification of text in emotional categories such as positive, neutral, negative and news. The following machine learning models were built and trained to predict the emotional drive of tweets related to climate change. Please enter your text below and select a machine learning model to predict the sentiment of your text.') raw_text = st.text_area("Enter Text","Type Here") # Model Prediction #Select model all_ml_modles= ["Linear SVC","Naive Bayes", "Logistic Regression"] model_choice = st.selectbox("Select base ML model",all_ml_modles) st.markdown("#### Select 'Classify' to view the result of the model prediction") st.markdown("") prediction_labels = {'anti climate change':-1,'news':2,'pro climate change':1,'neutral':0} if st.button("Classify"): #st.text("Original Text:\n{}".format(raw_text)) vect_text = tweet_cv.transform([raw_text]).toarray() if model_choice == 'Linear SVC': predictor = joblib.load(open(os.path.join("resources/models/linsvcmodel.pkl"),"rb")) prediction = predictor.predict(vect_text) # st.write(prediction) elif model_choice == 'Naive Bayes': predictor = joblib.load(open(os.path.join("resources/models/naivebayesmodel.pkl"),"rb")) prediction = predictor.predict(vect_text) # st.write(prediction) elif model_choice == 'Logistic Regression': predictor = joblib.load(open(os.path.join("resources/models/logisticregression.pkl"),"rb")) prediction = predictor.predict(vect_text) # st.write(prediction) final_result = get_keys(prediction,prediction_labels) st.success("Tweet categorized as : {} using the {} model".format(final_result, model_choice)) ##### Building out the NLP page #### if selection == "Natural Language Processing Tool": st.markdown('# Natural Language Processing Tool') st.markdown('Natural language processing, commonly known as NLP, is a field of artificial intelligence about the interaction between computers and humans using natural language. 
The objective of NLP is for the computer to read, understand and derive meaning from human languages.') st.markdown('The following text processing tools can be viewed on your input text below:\n' '- **Tokenization** - Listing each word and punctuation \n' '- **Lemmatization** - Returns single base form of a word \n' '- **Named-entity recognition (NER)** - Locate and classify entities in categories such as person names and organisations\n' '- **Parts of Speech tags (POS)** - The identification of words as nouns, verbs, adjectives, etc.') nlp_text = st.text_area("Enter your text to see how text is processed using the Spacy library.","Type Here") nlp_task = ["Tokenization","Lemmatization","NER","POS Tags"] task_choice = st.selectbox("Choose NLP Task",nlp_task) docx = nlp(nlp_text) lemma = [word.lemma_ for word in docx] token = [word.text for word in docx] tag = [word.tag_ for word in docx] depend = [word.dep_ for word in docx] pos = [token.pos_ for token in docx ] if st.button("Analyze"): if task_choice == 'Tokenization': token_df =pd.DataFrame(token, columns = ['Tokens']) st.dataframe(token_df) elif task_choice == 'Lemmatization': lemma_df = pd.DataFrame(zip(token, lemma), columns=['Tokens', 'Lemma']) st.dataframe(lemma_df) elif task_choice == 'NER': html = displacy.render(docx,style="ent") html = html.replace("\n\n","\n") st.write(HTML_WRAPPER.format(html),unsafe_allow_html=True) elif task_choice == 'POS Tags': pos_df=pd.DataFrame(zip(token, tag, depend), columns=['Tokens', 'Tag', 'Dependency']) st.dataframe(pos_df) st.markdown('---') #NLP table st.markdown('## View table of NLP results') st.markdown("Select 'View Table' to view a table of the tokens, lemma and POS tags of your text.") if st.button("View Table"): docx = nlp(nlp_text) table_df = pd.DataFrame(zip(token,lemma,pos),columns=['Tokens','Lemma','POS']) st.dataframe(table_df) st.markdown('---') #Word cloud st.markdown('## Generate text Word Cloud') st.markdown("Select 'Generate Word Cloud' to view a word cloud of the most common words in your text") if st.button("Generate Word Cloud"): wordcloud = WordCloud().generate(nlp_text) plt.imshow(wordcloud) plt.axis("off") st.pyplot() ##### Building out the EDA page ##### if selection == "Exploratory Data Analysis": # You can read a markdown file from supporting resources folder st.markdown("# Exploratory Data Analysis") st.markdown('This page discusses the Exploratory Data Analysis done on the Twitter data received to analyse and to build predictive machine learning models. 
Here you will find some of the insights from exploring the data as well as visualisations to describe some of our findings.') #Sentiment Description st.markdown("## Sentiment Description") st.markdown("The table displays the description of each sentiment category.") # Image st.image(Image.open(os.path.join("resources/imgs/sentiment_description.png"))) # Show dataset st.markdown("## Raw Twitter data and labels") st.markdown("Select the checkbox to view the original data") if st.checkbox('Show raw dataset'): # data is hidden if box is unchecked st.dataframe(raw_df) # will write the df to the page # Dimensions st.markdown("## Dataframe Dimensions") st.markdown("Select the buttons below to view the number of rows and columns for the raw dataset") data_dim = st.radio('Select dimension',('All','Rows','Columns')) if data_dim == 'All': st.text("Showing Shape of Entire Dataframe") st.info(raw_df.shape) if data_dim == 'Rows': st.text("Showing Length of Rows") st.info(raw_df.shape[0]) if data_dim == 'Columns': st.text("Showing Length of Columns") st.info(raw_df.shape[1]) # Count of labels st.markdown("## Sentiment labels") st.markdown("Below is a table displaying the count of each sentiment in the dataset. Majority of the tweets are positive(1) towards climate change. The least amount of tweets are negative(-1). This means that we have an unbalanced dataset that might have an effect on our prediction models. Select 'Show Bar Graph' to view this information visually.") bar_info = pd.DataFrame(raw_df['sentiment'].value_counts(sort=False)) bar_info.reset_index(level=0, inplace=True) bar_info.columns = ['Sentiment','Count'] bar_info['Percentage'] = [(i/len(raw_df['sentiment'])*100) for i in bar_info['Count']] st.dataframe(bar_info[['Sentiment','Count']]) # Bar Graph if st.button("Show Bar Graph"): sns.set(font_scale=.6) sns.set_style('white') plot = sns.catplot(x="sentiment", kind="count", edgecolor=".6",palette="pastel",data=df_with_metadata,label='small') plot.fig.set_figheight(2.5) plt.xlabel("Sentiment") plt.ylabel("Count") plt.title("Sentiment counts") st.pyplot(bbox_inches="tight") #Clean dataset st.markdown("# Processed dataset") # Clean tweets st.markdown("Select the checkbox to view the processed data with additional information extracted from the text.") if st.checkbox('Show processed dataset'): # data is hidden if box is uncheckedz st.dataframe(df_with_metadata) # Retweets st.markdown("## Retweets") st.markdown("The first thing we look at is the retweets. We find that just over 60% of the tweets are retweets. There is a possibility that some of these retweets are duplicates. 
We also look at the top 5 most retweeted tweets and how many times they were retweeted.") valuecounts = df_with_metadata['retweet'].value_counts() st.write('No: ', round(valuecounts[1]/len(df_with_metadata['retweet'])*100,2),'%') st.write('Yes: ', round(valuecounts[0]/len(df_with_metadata['retweet'])*100,2),'%') #Bar graph of number of rewteets sns.set(font_scale=.6) sns.set_style('white') plot = sns.catplot(x="retweet", kind="count", edgecolor=".6",palette="pastel",data=df_with_metadata); plt.xlabel("Retweet") plt.ylabel("Count") plt.title("Retweet count") plot.fig.set_figheight(2.5) st.pyplot(bbox_inches="tight") #View the top 10 retweeted tweets tdf = pd.DataFrame(df_with_metadata['message'].astype(str).value_counts()) st.dataframe(tdf[:6]) # Word Cloud - Static wordcloud st.markdown('## Hashtags and Mentions') st.markdown('We can tell a lot from the sentiment of tweets by looking at the hashtags or mentions that are used. Select an option from the dropdown menu to view a Word Cloud of the most common mentions and hashtags. You can also view the top mentions and hashtags per category.') wc_options = ["Top Hashtags", "Top Mentions", "Top Hashtags by Sentiment","Top Mentions by Sentiment"] wc_selection = st.selectbox("Select Word Cloud OPtion", wc_options) if wc_selection=="Top Hashtags": newsimg = Image.open('resources/imgs/TopHashWC.png') st.image(newsimg) elif wc_selection=="Top Mentions": newsimg = Image.open('resources/imgs/TopMentionWC.png') st.image(newsimg) elif wc_selection=="Top Hashtags by Sentiment": newsimg = Image.open('resources/imgs/HashtagCatWC.png') st.image(newsimg, width=700) elif wc_selection=="Top Mentions by Sentiment": newsimg = Image.open('resources/imgs/MentionsCatWC.png') st.image(newsimg, width=700) st.markdown('---') st.markdown('Select a checkbox below to view a table of the top hashtags or mentions for each category and how often they appear:') if st.checkbox('View top hashtags table'): st.dataframe(top_hashtags_df) if st.checkbox('View top mentions table'): st.dataframe(top_mentions_df) st.markdown('---') st.markdown('After looking at the top mentions and hashtags from the wordcloud above and doing some research, we can make a couple of assumptions: \n\n' '- This data seems to be taken from Americans around the time of the 2016 US presidential elections.\n\n' '- **@realDonaldTrump** is the top mentioned account. \n\n' '- **#Climatechange**, **#climate**, and **#Trump#** are the three most used hashtags') # Most Common Words st.markdown("## Most Common Words") st.markdown('If we look at the most common words used, we see the following:\n\n' "- For all the words: **climate**, **change**, **rt**, **global**,and **warming** all are at the top of the word counts. These are top occurrences throughout all categories.\n\n" "- For negative words: **science**, **cause**, **real**, and **scam** stand out as top words that are distinct to the negative category.\n\n" "- For news words: **fight**, **epa**, **pruit**, **scientist**, and **new** stand out as top words that are distinct to the news category.") st.dataframe(top_words_df) # Conclusion #st.markdown("## Conclusion") # Most Common Words st.markdown("## Created by:") st.markdown('\n' "- Karin Louw\n" "- Jonathan Dankers\n" "- Luvuyo Nkosana\n" "- Wright Nyoka\n" "- Kwande Skaap\n" "- Tsholofelo Mautjana")
elif opt == 2: got: str = input( 'Input # of desired results, a semicolon, and search text: ' ) gotargs: List[str] = got.split(';') n: int = int(gotargs[0]) recommendation_list = recommend_courses_using_search_text( gotargs[1], n) pprint(recommendation_list) elif opt == 3: got: str = input('Input course to generate an SVG tree for: ') gotargs: List[str] = got.split(' ') course: cd.Course = cd.Course(gotargs[0], gotargs[1]) with Path(course.program + course.designation + '.svg').open( 'w+', encoding='utf-8') as svg: svg.write( displacy.render(course_nlp_descs[course], style='dep', options={ 'compact': True, 'bg': 'white', 'color': 'black', 'font': 'DejaVu Sans Mono' })) except Exception as e: print('Failed:', e) # 20;CS 2201;EECE 2116;SC 3260;EES 4760;CS 3251;CS 2231;CS 4260;CS 3281;CS 3270;BUS 2100;BUS 2400
def consultaVirutoso(self, texto):
    # tokenize the text with spaCy
    text = self.nlp(texto)
    tokenized_sentences = [sentence.text for sentence in text.sents]
    # style the analyzed text
    spacyText = displacy.render(text, style="ent")
    # declare empty lists
    datos = []
    datostype = []
    entidades = []
    for sentence in tokenized_sentences:
        for entity in self.nlp(sentence).ents:
            entidades.append(entity.text)
            palabra = self.limpiarDatos(entity)
            consulta = """
                SELECT ?s ?p ?o
                WHERE {
                    ?s ?p ?o .FILTER (regex(str(?s), "%s") || regex(str(?o), "%s")) .
                }
            """ % (palabra, palabra)
            self.sbcEndpoint.setQuery(consulta)
            # return the query results in JSON format
            self.sbcEndpoint.setReturnFormat(JSON)
            results = self.sbcEndpoint.query().convert()
            for result in results["results"]["bindings"]:
                lista = []
                listaS = result["s"]["value"]
                listaP = result["p"]["value"]
                listaO = result["o"]["value"]
                lista.append(listaS)
                lista.append(listaP)
                lista.append(listaO)
                datos.append(lista)
    for sentence in tokenized_sentences:
        for entity in self.nlp(sentence).ents:
            entidades.append(entity.text)
            palabra = self.limpiarDatos(entity)
            consultatype = """
                PREFIX caseav: <http://localhost:8080/Data/page/>
                SELECT ?o
                WHERE {
                    {?s caseav:hasNombrePersona ?o .FILTER (regex(str(?o), "%s")) .}
                    UNION
                    {?s caseav:hashasApellidoPersona ?o .FILTER (regex(str(?o), "%s")) .}
                    UNION
                    {?s caseav:hasCodigo ?o .FILTER (regex(Str(?o), "%s")) .}
                    UNION
                    {?s caseav:hasNombreCompletoPersona ?o .FILTER (regex(Str(?o), "%s")) .}
                }
            """ % (palabra, palabra, palabra, palabra)
            self.sbcEndpoint.setQuery(consultatype)
            # return the query results in JSON format
            self.sbcEndpoint.setReturnFormat(JSON)
            results = self.sbcEndpoint.query().convert()
            for result in results["results"]["bindings"]:
                listae = []
                #listaSe = result["s"]["value"]
                #listaPe = result["p"]["value"]
                listaOe = result["o"]["value"]
                #listae.append(listaSe)
                #listae.append(listaPe)
                listae.append(listaOe)
                datostype.append(listae)
    return datos, entidades, spacyText, datostype
async def nlp_display(request):
    txt = request.rel_url.query['txt']
    style = request.rel_url.query['style']
    doc = nlp(txt)
    svg = displacy.render(doc, style=style)
    return web.Response(body=svg)
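A hedged variant of the same handler: since style="ent" yields HTML rather than SVG, setting an explicit content type on the aiohttp response helps browsers render either style. The parameter defaults below are assumptions, not part of the original handler.

async def nlp_display_html(request):
    txt = request.rel_url.query["txt"]
    style = request.rel_url.query.get("style", "dep")
    doc = nlp(txt)
    markup = displacy.render(doc, style=style)
    # "ent" produces HTML, "dep" produces SVG markup
    content_type = "text/html" if style == "ent" else "image/svg+xml"
    return web.Response(text=markup, content_type=content_type)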
print('\n----') for ent in doc8.ents: print(ent.text + ' - ' + ent.label_ + ' - ' + str(spacy.explain(ent.label_))) doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.") for chunk in doc9.noun_chunks: print(chunk.text) # DISPLAYING from spacy import displacy doc = nlp(u'Apple is going to build a U.K. factory for $6 million.') displacy.render(doc, style='dep', options={'distance': 110}) displacy.render(doc, style='ent', options={'distance': 110}) ################################################################# LEMMATIZATION doc1 = nlp( u"I am a runner running in a race because I love to run since I ran today") for token in doc1: print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_) def show_lemmas(text): for token in text: print( f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}'
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'This is a sentence.')
doc = nlp(u'Rats are various medium-sized, long-tailed rodents.')
displacy.render(doc, style='dep', jupyter=True)
print(token.text, end="|")

for i in tok.ents:
    print(i.text + ' ' + i.label_ + ' ' + str(spacy.explain(i.label_)))  # entity

# let's see chunks
doc = "Honda Plan to start a new plant at Khegegaon worth $78.45 billion"
text = model(doc)
for i in text.noun_chunks:
    print(i.text)

from spacy import displacy

text = "Apple to built a Mobile Manufacture company in Hongkong worth $76.8 million"
tok = model(text)
displacy.render(tok, style='dep', jupyter=True, options={'distance': 110})
# parts of speech and relation of each word with each other

from spacy import displacy

text = "Apple to built a Mobile Manufacture company in Hongkong worth $76.8 million"
tok = model(text)
displacy.render(tok, style='ent', jupyter=True)

# Lemmatization:
# Lemmatization, on the other hand, takes into consideration the morphological
# analysis of the words. To do so, it is necessary to have detailed dictionaries
# which the algorithm can look through to link the form back to its lemma.
# Again, you can see how it works with the same example words.
# Mapping from text-word to lemma
help(verb)
def main(): st.title("Your one stop NLP App") expander1 = st.beta_expander("Tokenization & Named Entity Recognition") with expander1: message = st.text_area("Your Text Below") col1, col2 = st.beta_columns(2) with col1: col1.header("Tokenize Your Text") if st.button("Show Entities"): nlp_result = text_analyzer(message) with col2: col2.header("NER") if st.button("Analyze"): docx = analyze_text(message) html = displacy.render(docx, style="ent") html = html.replace("\n\n", "\n") st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True) # Sentiment Analysis expander2 = st.beta_expander("View Sentiment") with expander2: st.subheader( "Polarity: It determines if the text expresses the positive, negative or neutral" ) st.markdown(''' #### value lies in the range of [-1,1] 1 : positive statement 0 : neutral statement -1 : negative statement.''') st.subheader( "Subjectivity: It determines if the text is subjective or objective" ) st.markdown(''' #### value lies in the range of [0,1] 0 : Subjective (Has emotions) 1 : Objective (Fact)''') message2 = st.text_area("Enter Your Text Below") if st.button("Show Sentiment metrices"): blob = TextBlob(message2) result_sentiment = blob.sentiment st.success(result_sentiment) # Summarization expander3 = st.beta_expander("Summarize your Text") with expander3: message3 = st.text_area("Add your text below") summary_options = st.selectbox("Choose Summarizer", ['Sumy', 'Gensim']) if st.button("Summarize"): if summary_options == 'Sumy': st.text("Using Sumy Summarizer ..") summary_result = sumy_summarizer(message3) elif summary_options == 'Gensim': st.text("Using Gensim Summarizer ..") summary_result = summarize(message3) else: st.warning("Using Default Summarizer") st.text("Using Gensim Summarizer ..") summary_result = summarize(message3) st.success(summary_result) # Dummy Data Generator expander4 = st.beta_expander("Generate Dummy Data") with expander4: column1, column2, column3 = st.beta_columns(3) with column1: number_to_gen = st.number_input("Number", 5, 5000) with column2: localized_providers = [ "ar_AA", "ar_EG", "ar_JO", "ar_PS", "ar_SA", "bg_BG", "bs_BA", "cs_CZ", "de", "de_AT", "de_CH", "de_DE", "dk_DK", "el_CY", "el_GR", "en", "en_AU", "en_CA", "en_GB", "en_IE", "en_IN", "en_NZ", "en_PH", "en_TH", "en_US", "es", "es_CA", "es_ES", "es_MX", "et_EE", "fa_IR", "fi_FI", "fil_PH", "fr_CA", "fr_CH", "fr_FR", "fr_QC", "he_IL", "hi_IN", "hr_HR", "hu_HU", "hy_AM", "id_ID", "it_CH", "it_IT", "ja_JP", "ka_GE", "ko_KR", "la", "lb_LU", "lt_LT", "lv_LV", "mt_MT", "ne_NP", "nl_BE", "nl_NL", "no_NO", "or_IN", "pl_PL", "pt_BR", "pt_PT", "ro_RO", "ru_RU", "sk_SK", "sl_SI", "sv_SE", "ta_IN", "th", "th_TH", "tl_PH", "tr_TR", "tw_GH", "uk_UA", "zh_CN", "zh_TW" ] locale = st.multiselect("Select Locale", localized_providers, default="en_IN") with column3: profile_options_list = [ 'username', 'name', 'sex', 'address', 'mail', 'birthdate' 'job', 'company', 'ssn', 'residence', 'current_location', 'blood_group', 'website' ] profile_fields = st.multiselect("Fields", profile_options_list, default=['username', 'mail']) custom_fake = Faker(locale) data = [ custom_fake.profile(fields=profile_fields) for i in range(number_to_gen) ] df = pd.DataFrame(data) st.dataframe(df) if st.button("Download"): make_downloadable_df_format(df)
#ps = PorterStemmer() #review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))] review = ' '.join(review) aspect_term.append(review) dataset['loc_start'] = dataset[' term_location'].replace("--[0-9]+", "", regex=True) dataset['end'] = dataset[' term_location'].replace("[0-9]+--", "", regex=True) loc_start = [] loc_start = dataset['loc_start'].tolist() loc_end = [] loc_end = dataset['end'].tolist() from spacy import displacy displacy.render(dep_par[4], style='dep') def check_similarity(text, aspect): for word in aspect.split(): #print word,text if text == word: #print("True") return True return False from spacy.symbols import nsubj, VERB, ADJ, amod, NOUN, acomp, dep, advmod, ccomp, pobj, prep, dobj, ADV, neg, attr def get_adj(i, start_loc, end_loc):
def _render_parses(i, to_render):
    to_render[0].user_data["title"] = "Batch %d" % i
    with Path("/tmp/parses.html").open("w") as file_:
        html = displacy.render(to_render[:5], style="dep", page=True)
        file_.write(html)
def render_pos_html(list_of_docs):
    return displacy.render(
        map(lambda x: get_spacy_doc(x), list_of_docs),
        style='dep', page=True)
# #display text # text_placeholder.write(text, unsafe_allow_html=True) #render waveform graph data = stream.read(CHUNK) #gives it to you in bytes npdata = np.frombuffer(data, np.int16) line.set_ydata(npdata) fig.canvas.draw() fig.canvas.flush_events() waveform_placeholder.write(fig) doc = nlp(text) #render HTML for NER tags html = displacy.render(doc, style="ent", options={"ents": ner_selection}) ner_visual_placeholder.write(get_html_textbox(html), unsafe_allow_html=True) #Get sentiment if model in ['en_core_web_sm']: _, sentiment, sentiment_colour = get_polarity(doc) sentiment_placeholder.write(get_html_sentiment(sentiment, sentiment_colour), unsafe_allow_html=True) _, subjectivity, subjectivity_colour = get_subjectivity(doc) subjectivity_placeholder.write(get_html_subjectivity( subjectivity, subjectivity_colour), unsafe_allow_html=True) else: polarity_placeholder.write(
def cy(x):
    return displacy.render(
        x,
        options=dict(compact=True, collapse_phrases=True,
                     word_spacing=15, distance=100))
colors[format_label(label, "reject")] = COLOR_REJECT ner_example_i = st.selectbox( f"Merged examples ({len(merged_examples)})", range(len(merged_examples)), format_func=lambda i: merged_examples[int(i)]["text"][:400], ) ner_example = merged_examples[int(ner_example_i)] doc = nlp.make_doc(ner_example["text"]) ents = [] for span in ner_example.get("spans", []): label = format_label(span["label"], span["answer"]) ents.append( doc.char_span(span["start"], span["end"], label=label)) doc.ents = filter_spans(ents) html = displacy.render(doc, style="ent", options={"colors": colors}) html = html.replace( "\n", " ") # Newlines seem to mess with the rendering st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True) show_ner_example_json = st.checkbox("Show JSON example") if show_ner_example_json: st.json(ner_example) st.subheader("Train a model (experimental)") no_missing = st.checkbox( "Data is gold-standard and contains no missing values", False) start_blank = st.checkbox("Start with blank NER model", True) if st.button("🚀 Start training"): if start_blank: ner = nlp.create_pipe("ner")
import spacy
from spacy import displacy
import sys
import os

file = open(sys.argv[1], 'r')
text = file.read()
file.close()

prdnlp = spacy.load("neural network annotator")  #
doc = prdnlp(text)

html = displacy.render(doc, style="ent")
Html_file = open("render.html", "w")
Html_file.write(html)
Html_file.close()
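For quick inspection, an alternative (sketch) is to let displaCy host the page itself instead of writing render.html; this reuses doc from the snippet above and blocks while serving. If writing the file instead, opening it with encoding="utf-8" avoids encoding errors for non-ASCII text.

# Serves the entity visualization locally (port 5000 by default).
displacy.serve(doc, style="ent")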
def main(): ''' NERS Demo w/ Sample Data ''' # CONFIG -------------------------------------------------- \\ # ------------------------------------------------------------ \\ # brnd, mpn, spplr model = 'post' # pre -> use non-trained model / post -> use trained model mpn = 'on' # on/off brnd = 'off' # on/off cmmdty = 'off' # on/off ruler = 'on' cleaner = 'on' number_tagger = 'off' # rem if stemmer is turned on after model does P2 training, then # you will need to use POS tag to detect nouns in products # then create new generator patterns for all.json # then run entity ruler again stemmer = 'off' # declare outputs brnd_pandas_file = r'C:\Users\stacy\Desktop\NERS Demo\out_pandas_brnd.xlsx' # output mpn_pandas_file = r'C:\Users\stacy\Desktop\NERS Demo\out_pandas_mpn.xlsx' # output cmmdty_pandas_file = r'C:\Users\stacy\Desktop\NERS Demo\out_pandas_cmmdty.xlsx' # output # declare inputs mpn_file = r'C:\Users\stacy\Desktop\NERS Demo\ners_patterns_mpn.jsonl' # input brnd_file = r'C:\Users\stacy\Desktop\NERS Demo\ners_patterns_brnd.jsonl' # input cmmdty_file = r'C:\Users\stacy\Desktop\NERS Demo\ners_patterns_cmmdty.jsonl' # input # mpn brnd cmmdty cases # 0 0 0 C1 # 1 0 0 C2 # 0 1 0 C3 # 0 0 1 C4 # 1 1 0 C5 # 0 1 1 C6 # 1 0 1 C7 # 1 1 1 C8 if mpn == 'off' and brnd == 'off' and cmmdty == 'off': # C1 patterns_file = mpn_file elif mpn == 'on' and brnd == 'off' and cmmdty == 'off': # C2 patterns_file = mpn_file elif mpn == 'off' and brnd == 'on' and cmmdty == 'off': # C3 patterns_file = brnd_file elif mpn == 'off' and brnd == 'off' and cmmdty == 'on': # C4 patterns_file = cmmdty_file elif mpn == 'on' and brnd == 'on' and cmmdty == 'off': # C5 patterns_file = combine_pattern_files(mpn_file, brnd_file) elif mpn == 'off' and brnd == 'on' and cmmdty == 'on': # C6 patterns_file = combine_pattern_files(brnd_file, cmmdty_file) elif mpn == 'on' and brnd == 'off' and cmmdty == 'on': # C7 patterns_file = combine_pattern_files(mpn_file, cmmdty_file) elif mpn == 'on' and brnd == 'on' and cmmdty == 'on': # C8 patterns_file = combine_pattern_files(mpn_file, brnd_file, cmmdty_file) tender_file = r'C:\Users\stacy\Desktop\NERS Demo\in_tender.csv' #tender_file = r'C:\Users\stacy\Desktop\NERS Demo\descriptions_nonstock.csv' write_type = 'w' # ------------------------------------------------------------ // # ---------------------------------------------------------- // # load model if model == 'pre': # load a language and invoke the entity ruler nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner']) #('en_core_web_sm', disable=['parser']) elif model == 'post': nlp = spacy.load('demo_model') nlp.add_pipe(sentence_segmenter, after='tagger') # add pipes if ruler == 'on': # rem if model is post then the entity ruler is already in the model if model == 'pre': # load patterns from external file only if model is not already trained nu_ruler = EntityRuler(nlp).from_disk(patterns_file) # putting the ruler before ner will override ner decisions in favor of ruler patterns nlp.add_pipe(nu_ruler)#, before='ner') # remember to swap precedence between ruler and ner after model training if model == 'post': # load patterns from external file only if model is not already trained if "entity_ruler" not in nlp.pipe_names: nu_ruler = EntityRuler(nlp).from_disk(patterns_file) # putting the ner before ruler will override favor ner decisions nlp.add_pipe(nu_ruler)#, before='ner') # show pipeline components: print(nlp.pipe_names) # import test tender and clean it up tender = import_csv(tender_file) # import if cleaner == 'on': tender = 
py_string_cleaner.clean_doc(tender) # clean doc = nlp(tender) # CONSOLE OUTPUT --------------------------------------------------------- # mpn brnd cmmdty cases # 0 0 0 C1 # 1 0 0 C2 # 0 1 0 C3 # 0 0 1 C4 # 1 1 0 C5 # 0 1 1 C6 # 1 0 1 C7 # 1 1 1 C8 labels = [] alt_labels = [] if mpn == 'on' and brnd == 'off' and cmmdty == 'off': print('\n') labels = ['MPN'] # , 'PRODUCT', 'MPN', 'SKU'] alt_labels = ['Mpn'] # , 'Product', 'MfrPartNo', 'SkuID'] total_found = [] total_unique_found = [] for label in labels: print('Results for {} --------------'.format(label)) tot_num = 0 unique_num = 0 unique = [] for ent in doc.ents: if ent.label_ == label: if ent.text not in unique: unique.append(ent.text) unique_num += 1 print([ent.text, ent.label_], end='') tot_num += 1 print('\nFound {} total, {} unique.\n'.format(tot_num, unique_num)) total_found.append(tot_num) total_unique_found.append(unique_num) if mpn == 'off' and brnd == 'on': print('\n') labels = ['BRND'] # , 'PRODUCT', 'MPN', 'SKU'] alt_labels = ['Brnd'] # , 'Product', 'MfrPartNo', 'SkuID'] total_found = [] total_unique_found = [] for label in labels: print('Results for {} --------------'.format(label)) tot_num = 0 unique_num = 0 unique = [] for ent in doc.ents: if ent.label_ == label: if ent.text not in unique: unique.append(ent.text) unique_num += 1 print([ent.text, ent.label_], end='') tot_num += 1 print('\nFound {} total, {} unique.\n'.format(tot_num, unique_num)) total_found.append(tot_num) total_unique_found.append(unique_num) if mpn == 'on' and brnd == 'on': print('\n') labels = ['BRND', 'MPN'] # , 'PRODUCT', 'MPN', 'SKU'] alt_labels = ['Brnd', 'Mpn'] # , 'Product', 'MfrPartNo', 'SkuID'] total_found = [] total_unique_found = [] for label in labels: print('Results for {} --------------'.format(label)) tot_num = 0 unique_num = 0 unique = [] for ent in doc.ents: if ent.label_ == label: if ent.text not in unique: unique.append(ent.text) unique_num += 1 print([ent.text, ent.label_], end='') tot_num += 1 print('\nFound {} total, {} unique.\n'.format(tot_num, unique_num)) total_found.append(tot_num) total_unique_found.append(unique_num) # pandas output for mpns ------------------------------------------------ # This technique allows you to isolate entities on # a sentence-by-sentence basis, which will allow # for matching entities on a record-by-record basis if mpn == 'on': w_MpnCodes = [] w_MpnCode_Alts = [] unique = [] mpn = '' alts = '' #ent_exists = False j = 0 for sent in doc.sents: i = 0 for ent in sent.ents: # ignore header record if j > 0: if ent.label_ == 'MPN': if i == 0: # if it's the first label in the record, save it in mpns mpn = ent.text unique.append(ent.text) i += 1 else: # if it's not the first label in the sentence, put it in mpn alts # (if it is already in alts, don't put it in) if ent.text not in unique: unique.append(ent.text) if alts == '': alts = ent.text else: alts = alts + ', ' + ent.text #print(ent.label_, ': ', ent.text) # store ent results for each record, ignoring the headers if j > 0: w_MpnCodes.append(mpn.upper()) w_MpnCode_Alts.append(alts.upper()) # test --------------- print('str ', j, 'w_MpnCodes: ', w_MpnCodes) print('str ', j, 'w_MpnCode_Alts: ', w_MpnCode_Alts) # test --------------- # reset vars for next record unique.clear() mpn = '' alts = '' j += 1 df = pd.DataFrame({ 'w_MpnCodes':w_MpnCodes, 'w_MpnCode_Alts':w_MpnCode_Alts}) writer = pd.ExcelWriter(mpn_pandas_file) df.to_excel(writer,'NERS_MPNs', index=False) writer.save() # pandas output for brnds ------------------------------------------------ # This 
technique allows you to isolate entities on # a sentence-by-sentence basis, which will allow # for matching entities on a record-by-record basis if brnd == 'on': w_Brnds = [] w_Brnd_Alts = [] unique = [] brnd_val = '' alts = '' #ent_exists = False j = 0 for sent in doc.sents: i = 0 for ent in sent.ents: # ignore header record if j > 0: if ent.label_ == 'BRND': if i == 0: # if it's the first label in the record, save it in brnd brnd_val = ent.text unique.append(ent.text) i += 1 else: # if it's not the first label in the sentence, put it in brnd alts # (if it is already in alts, don't put it in) if ent.text not in unique: unique.append(ent.text) if alts == '': alts = ent.text else: alts = alts + ', ' + ent.text #print(ent.label_, ': ', ent.text) # store ent results for each record, ignoring the headers if j > 0: w_Brnds.append(brnd_val.upper()) w_Brnd_Alts.append(alts.upper()) # test --------------- print('str ', j, 'w_Brnds: ', w_Brnds) print('str ', j, 'w_Brnd_Alts: ', w_Brnd_Alts) # test --------------- # reset vars for next record unique.clear() brnd_val = '' alts = '' j += 1 df2 = pd.DataFrame({ 'w_Brnds':w_Brnds, 'w_Brnd_Alts':w_Brnd_Alts}) writer2 = pd.ExcelWriter(brnd_pandas_file) df2.to_excel(writer2,'NERS_Brnds', index=False) writer2.save() # pandas output for cmmdty ------------------------------------------------ # This technique allows you to isolate entities on # a sentence-by-sentence basis, which will allow # for matching entities on a record-by-record basis if cmmdty == 'on': w_Cmmdtys = [] w_Cmmdty_Alts = [] unique = [] cmmdty_val = '' alts = '' #ent_exists = False j = 0 for sent in doc.sents: i = 0 for ent in sent.ents: # ignore header record if j > 0: if ent.label_ == 'CMMDTY': if i == 0: # if it's the first label in the record, save it in brnd cmmdty_val = ent.text unique.append(ent.text) i += 1 else: # if it's not the first label in the sentence, put it in brnd alts # (if it is already in alts, don't put it in) if ent.text not in unique: unique.append(ent.text) if alts == '': alts = ent.text else: alts = alts + ', ' + ent.text #print(ent.label_, ': ', ent.text) # store ent results for each record, ignoring the headers if j > 0: w_Cmmdtys.append(cmmdty_val.upper()) w_Cmmdty_Alts.append(alts.upper()) # test --------------- print('str ', j, 'w_Cmmdty: ', w_Cmmdtys) print('str ', j, 'w_Cmmdty_Alts: ', w_Cmmdty_Alts) # test --------------- # reset vars for next record unique.clear() brnd_val = '' alts = '' j += 1 df3 = pd.DataFrame({ 'w_Cmmdtys':w_Cmmdtys, 'w_Cmmdty_Alts':w_Cmmdty_Alts}) writer3 = pd.ExcelWriter(cmmdty_pandas_file) df3.to_excel(writer3,'NERS_Cmmdtys', index=False) writer3.save() # save the model -------------------------------------------------------- # save model with entity pattern updates made by the entity ruler if ruler == "on": output_dir = Path('demo_model') if not output_dir.exists(): output_dir.mkdir() nlp.to_disk(output_dir) print("Saved model to", output_dir) # TEST ----------------------------- mpns = [] # DISPLACY VISUALIZER ----------------------------------------------------- # get results for html doc results = '' i = 0 for item in alt_labels: results = results + '{}: {} tot {} unq\n'.format(item, total_found[i], total_unique_found[i]) i += 1 # store nlp object as string in html var spacer = '---------------------------------------------------------\n' header = 'Named Entities Found in Target File:\n' doc = nlp(header + spacer + results + spacer + tender) doc.user_data["title"] = "Named Entity Resolution System (NERS)" colors = { "MPN": 
"#C3FFA1", "BRND": "#FFDDA1", "CMMDTY": "#F3DDA1" } options = {"ents": ["MPN", "BRND", "CMMDTY"], "colors": colors} # displacy.serve(doc, style="ent", options=options) html = displacy.render(doc, style="ent", page=True, options=options) # use the entity visualizer # write the html string to the xampp folder and launch in browser through localhost port with open('C:/Users/stacy/My Localhost/index.html', 'w') as data: data.write(html) print('\n' + results) # end program print('Done.')
if "parser" in nlp.pipe_names: st.header("Dependency Parse & Part-of-speech tags") st.sidebar.header("Dependency Parse") split_sents = st.sidebar.checkbox("Split sentences", value=True) collapse_punct = st.sidebar.checkbox("Collapse punctuation", value=True) collapse_phrases = st.sidebar.checkbox("Collapse phrases") compact = st.sidebar.checkbox("Compact mode") options = { "collapse_punct": collapse_punct, "collapse_phrases": collapse_phrases, "compact": compact, } docs = [span.as_doc() for span in doc.sents] if split_sents else [doc] for sent in docs: html = displacy.render(sent, options=options) # Double newlines seem to mess with the rendering html = html.replace("\n\n", "\n") if split_sents and len(docs) > 1: st.markdown(f"> {sent.text}") st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True) if "ner" in nlp.pipe_names: st.header("Named Entities") st.sidebar.header("Named Entities") default_labels = ["PERSON", "ORG", "GPE", "LOC"] labels = st.sidebar.multiselect("Entity labels", nlp.get_pipe("ner").labels, default_labels) html = displacy.render(doc, style="ent", options={"ents": labels}) # Newlines seem to mess with the rendering html = html.replace("\n", " ")
def tree_vis(fmted_dep_tree):
    displacy.render(fmted_dep_tree, style="dep", manual=True,
                    page=False, minify=True)
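For reference, a minimal illustrative input for tree_vis in spaCy's manual dependency format (this example tree is an assumption, not taken from the surrounding code):

example_tree = {
    "words": [
        {"text": "displaCy", "tag": "PROPN"},
        {"text": "renders", "tag": "VERB"},
        {"text": "trees", "tag": "NOUN"},
    ],
    "arcs": [
        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
        {"start": 1, "end": 2, "label": "dobj", "dir": "right"},
    ],
}
tree_vis(example_tree)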
# target_labels = ['Finding', 'Disease or Syndrome', # 'Sign or Symptom', 'Pathologic Function', 'Neoplastic Process', 'Other'] # st.write(doc.ents) # for ent in doc.ents: # st.write(ent.text, ' - ', ent.label_) html = displacy.render(doc, style="ent", options={ "ents": [ 'FINDING', 'DISEASE OR SYNDROME', 'SIGN OR SYMPTOM', 'PATHOLOGIC FUNCTION', 'NEOPLASTIC PROCESS', 'OTHER' ], "colors": { 'FINDING': '#D0ECE7', 'DISEASE OR SYNDROME': '#D6EAF8', 'SIGN OR SYMPTOM': '#E8DAEF', 'PATHOLOGIC FUNCTION': '#FADBD8', 'NEOPLASTIC PROCESS': '#DAF7A6' } }) style = "<style>mark.entity { display: inline-block }</style>" st.write(f"{style}{get_html(html)}", unsafe_allow_html=True) data = [[ str(getattr(ent, attr)) for attr in ["text", "label_", "start", "end", "start_char", "end_char"] ] for ent in doc.ents # if ent.label_ in target_labels
def render_output( output: Optional[Union[Dict[str, str], QuestionAnsweringOutput]] = None, answer: Optional[str] = None, context: Optional[str] = None, question: Optional[str] = None, label: str = 'ANSWER', title: str = 'Question', grad_deg: str = '90deg', grad_pair: List[str] = ['#aa9cfc', '#fc9ce7'], span: Optional[Tuple[int, int]] = None, style: str = "ent", manual: bool = True, jupyter: bool = True, page: bool = False, minify: bool = True, return_html: bool = False, manual_data: Optional[Dict[str, Any]] = None, options: Optional[Dict[str, Any]] = None, ): """DisplaCy Visualizer for QA-Model Outputs. :param output: An output from the question-answering model. The output can be a dictionary with any or all keys: `question, answer, context`. Or a `QuestionAnsweringOutput` type object - If answer param is None, then the first `top-scored` answer will be chosen automatically. :param answer: (optional) A string sequence to represent as the answer. :param context: (optional) A list of string sequences or a single string to represet as the context (if `List[str]` - sequences will be joined). :param span: Span for highlighting the answer within the context. If None, its detected automatically. :param options: Visualizer options; visit the link for official DOCS: `https://spacy.io/api/top-level#displacy_options` :param manual_data: Defaults to ENT, keys; `'text', 'ents', 'titles'` DOCS: `https://spacy.io/usage/visualizers#manual-usage` """ if output is not None: if isinstance(output, dict): if 'question' in output: question = output['question'] if 'answer' in output: answer = output['answer'] if 'context' in output: context = output['context'] elif all(hasattr(output, attr) for attr in ('q', 'c', 'sids')): question, context = output.q, output.c # select the first top answer, if none provided. if answer is None: answer = output[0].answer if context is not None: if isinstance(context, list): context = ' '.join(context) e = f'Found item in List[{type(context[0])}], but expected List[str]' assert isinstance(context[0], str), e start, end = span if span is not None else (0, 0) if span is None: match = re.search(answer, context) if match and match.span() is not None: start, end = match.span() docs = dict() if manual_data is None else manual_data if manual_data is None: if style == "ent": docs["ents"] = [dict(start=start, end=end, label=label)] if len(context.strip()) > 1: docs['text'] = context if question is not None: docs['title'] = f"\n{title}: {question}\n" if options is None: if style == "dep": options = dict(compact=True, bg="#ed7118", color="#000000") else: options = dict(ents=None, colors=None) gradient = ", ".join([grad_deg] + grad_pair) colors = f"linear-gradient({gradient})" options.update({'ents': [label], 'colors': {label: colors}}) if return_html: return displacy.render([docs], style=style, jupyter=False, options=options, manual=manual) displacy.render([docs], style=style, page=page, minify=minify, jupyter=jupyter, options=options, manual=manual)
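A usage sketch for render_output (the question, answer, and context strings below are made up for illustration; with the defaults it renders inline in a Jupyter notebook):

render_output(
    question="Where did Thrun start working on self-driving cars?",
    answer="Google",
    context="When Sebastian Thrun started working on self-driving cars at "
            "Google in 2007, few people outside of the company took him seriously.",
)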
def main(): #st.sidebar.title("About") if st.sidebar.button("About this app"): st.sidebar.info( "This is an auto summarizer app for text articles, extracting the most important sentences by using NLP algorithms. It helps us to save time in our busy schedules who prefer to read the summary of those articles before we decide to jump in for reading entire article." ) #st.write('<style>body { margin: 0; font-family: font-family: Tangerine;font-size:48px, Helvetica, sans-serif;font-size: 30px;text-align: center;} .header{padding: 10px 16px; background: #eaf4ff; color: #111; position:fixed;top:0;text-align: center;} .sticky { position: center; top: 0; width: 100%;} </style><div class="header" id="myHeader">'+str('RESUNER')+'</div>', unsafe_allow_html=True) st.write( '<style>body { margin: 0; font-family: font-family: Tangerine;font-size:48px, Helvetica, sans-serif;font-size: 30px;text-align: justify;} .header{padding: 10px 16px; background: #eaf4ff; color: #111; position:fixed;top:0;text-align: center;} .sticky { position: fixed; top: 0; width: 100%;} </style><div class="header" id="myHeader">' + str('Summary Generator and Entity Recognizer') + '</div>', unsafe_allow_html=True) #st.title("Summary Generator and Entity checker") activities = [ "Summarize", "Summarize for URL", "NER Checker", "NER for URL" ] choice = st.radio("Select Activity", activities) if choice == 'Summarize': st.info( "Please paste your text into the left side box & click the 'Summarize!' to view the summary" ) st.sidebar.subheader("Summarization") raw_text = st.sidebar.text_area("Enter Text Here") #summary_choice = st.selectbox("Summary Choice",["Gensim","Sumy Lex Rank"]) if st.sidebar.button("Summarize!"): summary_result = sumy_summarizer(raw_text) estimatedTime_org = readingTime(raw_text) #text_length = st.slider("Length to Preview",50,100) st.info( "Original Reading time - {} mins".format(estimatedTime_org)) st.write(summary_result) estimatedTime_res = readingTime(summary_result) st.info("Summary Reading time - {} mins".format(estimatedTime_res)) engine = pyttsx3.init(driverName='sapi5') #infile = "tanjil.txt" # f = open(infile, 'r') #theText = f.read() #f.close() #Saving part starts from here tts = gTTS(text=summary_result, lang='en') #saved_file=talkToMe(summary_result , lgg ='en') tts.save("saved_file.mp3") audio_file = open('saved_file.mp3', 'rb') audio_bytes = audio_file.read() st.audio(audio_bytes, format='audio/mp3', start_time=0) st.sidebar.subheader("Visualizations") visualize = ["Select", "WordCloud", "Bigrams", "Trigrams"] choice2 = st.sidebar.selectbox("Visualize", visualize) #if choice2 == "Only Summary": if choice2 == "WordCloud": c_text = raw_text #plt.figure(figsize=[70,50]) maskArray = np.array(Image.open("comment.png")) wordcloud = WordCloud(max_font_size=200, max_words=3000, margin=10, background_color='white', mask=maskArray, contour_width=3, contour_color='black', scale=3, relative_scaling=0.5, width=1900, height=1900, random_state=1).generate(c_text) plt.imshow(wordcloud, interpolation='bilinear') plt.axis("off") st.pyplot() if choice2 == "Bigrams": c_text = ngrams((raw_text), 2) for i in range(0, len(c_text)): c_text[i] = " ".join(c_text[i]) Bigram_Freq = nltk.FreqDist(c_text) maskArray = np.array(Image.open("comment.png")) #bigram_wordcloud = WordCloud(random_state = 21).generate_from_frequencies(Bigram_Freq) #plt.figure(figsize = (50,25)) bigram_wordcloud = WordCloud( max_font_size=150, max_words=2000, margin=10, background_color='white', mask=maskArray, contour_width=3, contour_color='black', 
scale=3, relative_scaling=0.5, width=900, height=900, random_state=1).generate_from_frequencies(Bigram_Freq) #plt.figure(figsize = (50,25)) plt.imshow(bigram_wordcloud, interpolation='bilinear') plt.axis("off") # maskArray = np.array(Image.open("C:/Users/NAKKANA1/OneDrive - Novartis Pharma AG/Desktop/aws_study/streamlit/wordcloudsummy/cloud2.png")) #wordCloud = WordCloud(max_words=WC_max_words, height=WC_height, width=WC_width,stopwords=stopwords_wc, background_color='white', mask = maskArray).generate_from_frequencies(dict(words_freq)) # wordCloud = WordCloud(max_font_size=150,max_words=2000, margin=10, background_color='white', mask = maskArray, # scale=3, relative_scaling = 0.5, width=900, height=900,random_state=1).generate_from_frequencies(c_text) # plt.title('Most frequently occurring bigrams connected by same colour and font size') # plt.imshow(wordCloud, interpolation='bilinear') # plt.axis("off") #return st.pyplot() st.pyplot() if choice2 == "Trigrams": c_text = ngrams((raw_text), 3) for i in range(0, len(c_text)): c_text[i] = " ".join(c_text[i]) trigram_Freq = nltk.FreqDist(c_text) maskArray = np.array(Image.open("comment.png")) #bigram_wordcloud = WordCloud(random_state = 21).generate_from_frequencies(Bigram_Freq) #plt.figure(figsize = (50,25)) trigram_wordcloud = WordCloud( max_font_size=150, max_words=200, margin=10, background_color='white', mask=maskArray, contour_width=3, contour_color='black', scale=3, relative_scaling=0.5, width=900, height=900, random_state=1).generate_from_frequencies(trigram_Freq) #plt.figure(figsize = (50,25)) plt.imshow(trigram_wordcloud, interpolation='bilinear') plt.axis("off") st.pyplot() #st.write('<style>body { margin: 0; font-family: Arial, Helvetica, sans-serif;} .header{padding: 10px 16px; background: #7f78d2; color: #f1f1f1; position:fixed;top:0;} .sticky { position: fixed; top: 0; width: 100%;} </style><div class="header" id="myHeader">'+str('Summarator')+'</div>', unsafe_allow_html=True) if choice == 'NER Checker': st.info( "About NER Checker: Named-entity recognition (NER) automatically identifies names of people, places, products & organizations. The entities displayed here is PERSON, NORP (nationalities, religious and political groups), FAC (buildings, airports etc.), ORG (organizations), GPE (countries, cities etc.), LOC (mountain ranges, water bodies etc.), PRODUCT (products), EVENT (event names), WORK_OF_ART (books, song titles), LAW (legal document titles), LANGUAGE (named languages), DATE, TIME, PERCENT, MONEY, QUANTITY, ORDINAL and CARDINAL" ) st.sidebar.subheader("Entity Recognition") raw_text = st.sidebar.text_area("Enter Text Here", "Type Here") if st.sidebar.button("Analyze!"): # NLP docx = analyze_text(raw_text) html = displacy.render(docx, style='ent') html = html.replace("\n\n", "\n") #st.write(html,unsafe_allow_html=True) st.markdown(html, unsafe_allow_html=True) if choice == 'NER for URL': st.info( "About NER Checker: Named-entity recognition (NER) automatically identifies names of people, places, products & organizations. 
The entities displayed here is PERSON, NORP (nationalities, religious and political groups), FAC (buildings, airports etc.), ORG (organizations), GPE (countries, cities etc.), LOC (mountain ranges, water bodies etc.), PRODUCT (products), EVENT (event names), WORK_OF_ART (books, song titles), LAW (legal document titles), LANGUAGE (named languages), DATE, TIME, PERCENT, MONEY, QUANTITY, ORDINAL and CARDINAL" ) st.sidebar.subheader("Analyze text from URL") raw_url = st.sidebar.text_input("Enter URL Here", "Type here") #text_preview_length = st.slider("Length to Preview",50,100) if st.sidebar.button("Analyze"): if raw_url != "Type here": result = get_text(raw_url) # len_of_full_text = len(result) # len_of_short_text = round(len(result)/text_preview_length) # st.success("Length of Full Text::{}".format(len_of_full_text)) # st.success("Length of Short Text::{}".format(len_of_short_text)) # st.info(result[:len_of_short_text]) #summarized_docx = sumy_summarizer(result) docx = analyze_text(result) html = displacy.render(docx, style="ent") html = html.replace("\n\n", "\n") #st.write(HTML_WRAPPER1.format(html),unsafe_allow_html=True) st.markdown(html, unsafe_allow_html=True) if choice == 'Summarize for URL': st.info( "Please paste your url into the left side box & click the 'Summarize!' to view the summary" ) st.sidebar.subheader("Summary from URL") raw_url = st.sidebar.text_input("Enter URL", "Type here") #text_length = st.sidebar.slider("Length to Preview",50,100) # text_length = st.slider("Length to Preview",50,100) if st.sidebar.button("Summarize!"): if raw_url != "Type here": result = get_text(raw_url) estimatedTime_org_url = readingTime(result) #text_length = st.slider("Length to Preview",50,100) #st.info("Original Reading time - {} mins".format(estimatedTime_org_url)) #len_of_full_text = len(result) #len_of_short_text = round(len(result)/text_length) #st.info("Length::Full Text::{}".format(len_of_full_text)) #st.info("Length::Short Text::{}".format(len_of_short_text)) #st.write(result[:len_of_short_text]) summary_result_url = sumy_summarizer(result) st.write(summary_result_url) estimatedTime_res_url = readingTime(summary_result_url) st.info("Summary Reading time - {} mins".format( estimatedTime_res_url)) engine = pyttsx3.init(driverName='sapi5') #infile = "tanjil.txt" # f = open(infile, 'r') #theText = f.read() #f.close() #Saving part starts from here tts = gTTS(text=summary_result_url, lang='en') #saved_file2=talkToMe(summary_result_url , lgg ='en') tts.save("saved_file3.mp3") audio_file2 = open('saved_file3.mp3', 'rb') audio_bytes2 = audio_file2.read() st.audio(audio_bytes2, format='audio/mp3', start_time=0) st.sidebar.subheader("Visualizations") visualize = ["Select", "WordCloud", "Bigrams", "Trigrams"] choice2 = st.sidebar.selectbox("Visualize", visualize) #if choice2 == "Only Summary": if choice2 == "WordCloud": if raw_url != "Type here": result = get_text(raw_url) c_text = result #plt.figure(figsize=[70,50]) maskArray = np.array(Image.open("comment.png")) wordcloud = WordCloud(max_font_size=200, max_words=3000, margin=10, background_color='white', mask=maskArray, contour_width=3, contour_color='black', scale=3, relative_scaling=0.5, width=1900, height=1900, random_state=1).generate(c_text) plt.imshow(wordcloud, interpolation='bilinear') plt.axis("off") st.pyplot() if choice2 == "Bigrams": if raw_url != "Type here": result = get_text(raw_url) c_text = ngrams((result), 2) for i in range(0, len(c_text)): c_text[i] = " ".join(c_text[i]) Bigram_Freq_u = nltk.FreqDist(c_text) maskArray = 
np.array(Image.open("comment.png")) #bigram_wordcloud = WordCloud(random_state = 21).generate_from_frequencies(Bigram_Freq) #plt.figure(figsize = (50,25)) bigram_wordcloud_u = WordCloud( max_font_size=150, max_words=2000, margin=10, background_color='white', mask=maskArray, contour_width=3, contour_color='steelblue', scale=3, relative_scaling=0.5, width=900, height=900, random_state=1).generate_from_frequencies(Bigram_Freq_u) #plt.figure(figsize = (50,25)) plt.imshow(bigram_wordcloud_u, interpolation='bilinear') plt.axis("off") st.pyplot() if choice2 == "Trigrams": if raw_url != "Type here": result = get_text(raw_url) c_text = ngrams((result), 3) for i in range(0, len(c_text)): c_text[i] = " ".join(c_text[i]) trigram_Freq_u = nltk.FreqDist(c_text) maskArray = np.array(Image.open("comment.png")) #bigram_wordcloud = WordCloud(random_state = 21).generate_from_frequencies(Bigram_Freq) #plt.figure(figsize = (50,25)) trigram_wordcloud_u = WordCloud( max_font_size=150, max_words=200, margin=10, background_color='white', mask=maskArray, contour_width=3, contour_color='black', scale=3, relative_scaling=0.5, width=900, height=900, random_state=1).generate_from_frequencies(trigram_Freq_u) #plt.figure(figsize = (50,25)) plt.imshow(trigram_wordcloud_u, interpolation='bilinear') plt.axis("off") st.pyplot() st.sidebar.title("") st.sidebar.info("Connect: [email protected]")
def main():
    nlp = spacy.load('en_core_web_sm')

    # sentence functions
    print('sentence example: ---------------------------')
    sent = nlp(sentences[0])
    print(sent.text)
    for token in sent:
        print(token.text, token.pos_, token.dep_)
    print('\n')

    # string example functions
    print('string example: ---------------------------')
    sampleString = u"I can't imagine spending $3000 for a single bedroom apartment in N.Y.C."
    sample_doc = nlp(sampleString)
    print(sample_doc.text)
    for token in sample_doc:
        print(token.text, token.pos_, token.dep_)
    print('\n')

    # product file functions
    print('products example 1: ---------------------------')
    infile = open('products_DescriptionOnly_short.csv', 'rt')
    print(infile.read(), '\n')
    # reset cursor
    infile.seek(0)
    for line in infile:
        nextLine = line.rstrip()
        # nextStr = nextLine
        nlpStr = nlp(nextLine)
        for token in nlpStr:
            print(token.text, token.pos_, token.dep_)
        print('\n')
    # close input data file
    infile.close()

    # product file functions (2)
    print('products example 2: ---------------------------')
    # print all data
    infile = open('products_DescriptionOnly.csv', 'rt')
    fData = infile.read()
    # the doc object is processed as it is passed to the language object
    nlpData = nlp(fData)
    print(nlpData)

    # print tokens
    print('\ntokens:')
    for tok in nlpData[:6]:
        print('{} -> {} -> {}'.format(tok.text, tok.pos_, tok.ent_type_))

    # print entities
    print('\nentities:')
    for ent in nlpData.ents:
        print('{} --> {}'.format(ent.text, ent.label_))

    # count selected entity types
    # rem: NLTK comes with pre-trained models for splitting text
    # to sentences and sentences to words
    print('\n')
    orgNum = 0
    carNum = 0
    perNum = 0
    print('ORGs:')
    for ent in nlpData.ents:
        if ent.label == spacy.symbols.ORG:
            orgNum += 1
            print(ent.text)
        if ent.label == spacy.symbols.CARDINAL:
            carNum += 1
        if ent.label == spacy.symbols.PERSON:
            perNum += 1
    print('\n')
    print('# of ORG: ', orgNum)
    print('# of CARDINAL: ', carNum)
    print('# of PERSON: ', perNum)
    infile.close()

    # examine additional spacy functions
    print('\nexplore additional spacy functions: ---------------')
    for token in nlpData[:6]:
        print('token.text: ', token.text)            # the original string
        print('token.ent_type_: ', token.ent_type_)  # entity type
        print('token.ent_iob_: ', token.ent_iob_)    # IOB code of the entity tag (B/I/O)
        print('token.pos_: ', token.pos_)            # the coarse part of speech
        print('token.tag_: ', token.tag_)            # the fine-grained part-of-speech tag
        print('token.dep_: ', token.dep_)            # dependency relation
        print('token.head.text: ', token.head.text)  # navigate up the tree
        print('token.lefts: ', list(token.lefts))    # left children of the token
        print('token.rights: ', list(token.rights))  # right children of the token
        print('\n-----------------')

    # apply more spacy features to a string
    nuDoc = nlp('This is an SKF product called Ball Bearing for $45 USD')
    for token in nuDoc:
        print('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}'.format(
            token.text,      # original string
            token.idx,       # character index
            token.lemma_,    # base form of the word
            token.is_punct,  # bool: is it punctuation
            token.is_space,  # bool: is it a space
            token.shape_,    # visual signature, e.g. Xxxxx
            token.pos_,      # coarse part of speech
            token.tag_       # fine-grained part-of-speech tag
        ))

    # test displaCy
    # viewable in a jupyter notebook
    print('\ndisplaCy snippet for jupyter notebook ---------------------------')
    doc = nlp('I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ')
    displacy.render(doc, style='ent', jupyter=True)

    # test the chunker
    print('\ntest the chunker 1 -----------')
    doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")
    for chunk in doc.noun_chunks:
        print(chunk.text, chunk.label_, chunk.root.text)

    # test the chunker
    print('\ntest the chunker 2 -----------')
    doc = nlp('Bore Diameter 40mm inner ring width 23 mm spherical roller bearing')
    for chunk in doc.noun_chunks:
        print(chunk.text, chunk.label_, chunk.root.text)

    # test span object
    print('\ntest span object -----------')
    span = doc[2:6]  # 40mm inner ring
    print(span.text)

    # test lexical attributes
    print('\ntest lexical attributes ---------------')
    doc = nlp("It costs $5.")
    print('Text: ', 'It costs $5')
    print('Index: ', [token.i for token in doc])
    print('Text: ', [token.text for token in doc])
    print('is_alpha:', [token.is_alpha for token in doc])
    print('is_punct:', [token.is_punct for token in doc])
    print('like_num:', [token.like_num for token in doc])

    # test the dependency parser
    print('\ntest the dependency parser -----------')
    doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
    for token in doc:
        print("{0}/{1} <--{2}-- {3}/{4}".format(token.text, token.tag_, token.dep_,
                                                token.head.text, token.head.tag_))

    # end program
    print('\nDone.')
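# Hedged sketch: when main() is run outside a notebook, displacy.render(...,
# style='dep') simply returns the SVG markup instead of displaying it, so the
# same visualization can be written to disk. The variable names and the file
# name 'dep_example.svg' are only illustrations.
import spacy
from pathlib import Path
from spacy import displacy

nlp_cli = spacy.load('en_core_web_sm')
doc_cli = nlp_cli('Wall Street Journal just published an interesting piece on crypto currencies')
svg = displacy.render(doc_cli, style='dep', jupyter=False)
Path('dep_example.svg').write_text(svg, encoding='utf-8')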
def entity(self, filename):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(self.text)
    html_entity = displacy.render(doc, style="ent")
    output_path = Path(filename)
    output_path.open("w", encoding="utf-8").write(html_entity)
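# Hedged sketch of the same idea as entity() above, written as a standalone
# helper because the class that owns entity() is not shown in this snippet.
# Passing page=True makes displaCy return a complete HTML document rather than
# a fragment; the function and file names here are illustrative assumptions.
import spacy
from pathlib import Path
from spacy import displacy

def save_entity_html(text, filename):
    nlp_ent = spacy.load("en_core_web_sm")
    html = displacy.render(nlp_ent(text), style="ent", page=True)
    Path(filename).write_text(html, encoding="utf-8")

save_entity_html("Google was founded in 1998.", "entities.html")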
def test_displacy_spans(en_vocab):
    """Test that displaCy can render Spans."""
    doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
    html = displacy.render(doc[1:4], style="ent")
    assert html.startswith("<div")
def visualizeParsedText(sent):
    doc = nlp(sent)
    displacy.render(doc, style='dep', jupyter=True)
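# Hedged sketch: the same dependency view can be tuned through displaCy's
# options dict (compact layout, arc distance, colours). The option keys are
# standard displaCy dep options; nlp_dep and the example sentence are
# illustrative assumptions.
import spacy
from spacy import displacy

nlp_dep = spacy.load('en_core_web_sm')
dep_options = {"compact": True, "distance": 100, "bg": "#ffffff", "color": "#000000"}
displacy.render(nlp_dep("Google is starting from behind."), style='dep',
                jupyter=True, options=dep_options)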
def visualize_ent(doc, context=True, sections=True, jupyter=True, colors=None):
    """Create a NER-style visualization for targets and modifiers in Doc.

    doc (Doc): A spaCy Doc.
    context (bool): Whether to display the modifiers generated by medSpaCy's
        cycontext. If the doc has not been processed by context, this will be
        automatically changed to False. Default True.
    sections (bool): Whether to display the section titles generated by
        medSpaCy's sectionizer (still in development). If the doc has not been
        processed by the sectionizer, this will be automatically changed to
        False. This may also have some overlap with cycontext, in which case
        duplicate spans will be displayed. Default True.
    jupyter (bool): If True, will render directly in a Jupyter notebook. If
        False, will return the HTML. Default True.
    colors (dict or None): An optional dictionary which maps labels of targets
        and modifiers to color strings to be rendered. If None, will create a
        generator which cycles through the default matplotlib colors for ent
        and modifier labels and uses a light gray for section headers.
        Default None.
    """
    # Make sure that doc has the custom medSpaCy attributes registered
    if not hasattr(doc._, "context_graph"):
        context = False
    if not hasattr(doc._, "sections"):
        sections = False

    ents_data = []

    for target in doc.ents:
        ent_data = {
            "start": target.start_char,
            "end": target.end_char,
            "label": target.label_.upper(),
        }
        ents_data.append((ent_data, "ent"))

    if context:
        visualized_modifiers = set()
        for target in doc.ents:
            for modifier in target._.modifiers:
                if modifier in visualized_modifiers:
                    continue
                ent_data = {
                    "start": modifier.span.start_char,
                    "end": modifier.span.end_char,
                    "label": modifier.category,
                }
                ents_data.append((ent_data, "modifier"))
                visualized_modifiers.add(modifier)
    if sections:
        for (title, header, _) in doc._.sections:
            if title is None:
                continue
            ent_data = {
                "start": header.start_char,
                "end": header.end_char,
                "label": f"<< {title.upper()} >>",
            }
            ents_data.append((ent_data, "section"))
    if len(ents_data) == 0:
        # No data to display
        viz_data = [{"text": doc.text, "ents": []}]
        options = dict()
    else:
        ents_data = sorted(ents_data, key=lambda x: x[0]["start"])
        # If colors aren't defined, generate color mappings for each entity
        # and modifier label and set all section titles to a light gray
        if colors is None:
            labels = set()
            section_titles = set()
            for (ent_data, ent_type) in ents_data:
                if ent_type in ("ent", "modifier"):
                    labels.add(ent_data["label"])
                elif ent_type == "section":
                    section_titles.add(ent_data["label"])
            colors = _create_color_mapping(labels)
            for title in section_titles:
                colors[title] = "#dee0e3"
        ents_display_data, _ = zip(*ents_data)
        viz_data = [{"text": doc.text, "ents": ents_display_data}]
        options = {
            "colors": colors,
        }
    return displacy.render(
        viz_data, style="ent", manual=True, options=options, jupyter=jupyter
    )
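# Hedged sketch of the manual-mode payload that visualize_ent() assembles:
# with manual=True, displaCy accepts plain dicts instead of Doc objects, which
# is how the modifier and section "entities" above get rendered alongside the
# real entities. The text, labels and color below are made-up examples.
from spacy import displacy

manual_data = [{
    "text": "Patient denies chest pain.",
    "ents": [
        {"start": 8, "end": 14, "label": "NEGATED_EXISTENCE"},
        {"start": 15, "end": 25, "label": "CONDITION"},
    ],
}]
html_sketch = displacy.render(manual_data, style="ent", manual=True, jupyter=False,
                              options={"colors": {"CONDITION": "#ffd966"}})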
show_ents(doc)

for i in range(len(sentences)):
    doc = nlp(sentences[i])
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.is_stop)

for i in range(len(sentences)):
    doc = nlp(sentences[i])
    for ent in doc.ents:
        print(ent.text, ent.label_)

html = []
doc = nlp(sentences[5])
html = displacy.render(doc, style="ent", jupyter=False)  # NER html
# View in HTML Viewer

# Write a function to display basic entity info:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag


def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    return sent
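# Hedged usage sketch for preprocess(): the token list it returns can be fed to
# NLTK's pos_tag(). The nltk.download() calls fetch the tokenizer and tagger
# models if they are not already installed; the example sentence is illustrative.
import nltk
from nltk.tag import pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
tagged = pos_tag(preprocess("Google is starting from behind."))
print(tagged)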
def display_dep(doc):
    displacy.render(doc, style="dep")
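# Hedged sketch: outside a notebook, displacy.serve() is the companion to
# display_dep(); it hosts the dependency visualization on a local web server
# (http://localhost:5000 by default) and blocks until stopped. nlp_srv and the
# sentence are illustrative assumptions.
import spacy
from spacy import displacy

nlp_srv = spacy.load("en_core_web_sm")
displacy.serve(nlp_srv("Google is starting from behind."), style="dep")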
def train_ner(self, train_data, model=None, new_model_name="german_modified",
              output_dir=None, n_iter=30, labels=None, test_model=False):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    # training data format:
    # TRAIN_DATA = [
    #     (
    #         "Horses are too tall and they pretend to care about your feelings",
    #         {"entities": [(0, 6, LABEL)]},
    #     ),
    #     ("Do they bite?", {"entities": []}),
    # ]
    TRAIN_DATA = train_data
    random.seed(0)
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    # add the new entity labels to the entity recognizer
    for label in labels:
        ner.add_label(label)

    optimizer = nlp.resume_training()
    move_names = list(ner.move_names)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    train_losses = []
    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            # print("Losses", losses)
            train_losses.append(losses)

    # test the trained model
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    displacy.render(doc, style='ent', jupyter=True)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to: ", output_dir)
        if test_model:
            # test the saved model
            print("Loading from", output_dir)
            nlp2 = spacy.load(output_dir)
            # Check the classes have loaded back consistently
            assert nlp2.get_pipe("ner").move_names == move_names
            doc2 = nlp2(test_text)
            for ent in doc2.ents:
                print(ent.label_, ent.text)
    return train_losses
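# Hedged usage sketch for train_ner() above. The class that owns the method is
# not shown in this snippet, so `trainer` below is a hypothetical instance of
# it; the label and the two training sentences are made-up examples in the
# spaCy 2.x training format described in the method's comment.
NEW_LABEL = "ANIMAL"
TRAIN_DATA = [
    ("Horses are too tall.", {"entities": [(0, 6, NEW_LABEL)]}),
    ("Do they bite?", {"entities": []}),
]
train_losses = trainer.train_ner(
    TRAIN_DATA,
    new_model_name="animal_ner",
    output_dir="./animal_ner_model",
    n_iter=10,
    labels=[NEW_LABEL],
    test_model=False,
)
print(train_losses[-1])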