def create_scatter_text(writers, names, messages, nonames=False):
    """Build a compact Scattertext explorer and save it to ./demo_compact.html.

    :param writers: sequence of author (category) values. ``writers[0]`` is
        the plotted category; unless ``nonames`` is set, ``writers[1]`` is
        used as the label of the opposing axis.
    :param names: author of each message (becomes the 'author' column).
    :param messages: message texts (becomes the 'message' column).
    :param nonames: if true, label the axes "Author_0" / "Author_1" instead
        of using the entries of ``writers``.
    :return: None; the HTML is written to ./demo_compact.html.
    """
    my_df = pd.DataFrame({"author": names, "message": messages})
    nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())
    my_df['parse'] = my_df['message'].apply(nlp)

    corpus = st.CorpusFromParsedDocuments(
        my_df, category_col='author', parsed_col='parse'
    ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))

    # Only the axis labels differ between the two modes; everything else
    # passed to the explorer is identical.
    if nonames:
        category_name, not_category_name = "Author_0", "Author_1"
    else:
        category_name, not_category_name = writers[0], writers[1]

    html = st.produce_scattertext_explorer(
        corpus,
        category=writers[0],
        category_name=category_name,
        not_category_name=not_category_name,
        minimum_term_frequency=0,
        pmi_threshold_coefficient=0,
        width_in_pixels=1000,
        transform=st.Scalers.dense_rank)

    # Write UTF-8 bytes explicitly so the output does not depend on the
    # platform's default text encoding.
    with open('./demo_compact.html', 'wb') as f:
        f.write(html.encode('utf-8'))
def main():
    """Compare two Japanese Project Gutenberg texts and write ./demo_japanese.html."""
    shisei = _parse_geutenberg(
        'http://www.gutenberg.org/files/31617/31617-0.txt')
    horadanshaku = _parse_geutenberg(
        'http://www.gutenberg.org/files/34084/34084-0.txt')
    df = pd.DataFrame({
        'text': [shisei, horadanshaku],
        'title': ['Shisei', 'Horadanshaku tabimiyage'],
        'author': ['Akutagawa Ryunosuke', 'Kuni Sasaki'],
    })
    df['text'] = df['text'].apply(st.japanese_nlp)
    corpus = st.CorpusFromParsedDocuments(
        df, category_col='title', parsed_col='text').build()
    html = st.produce_scattertext_explorer(
        corpus,
        category='Shisei',
        category_name='Shisei',
        not_category_name='Horadanshaku tabimiyage',
        minimum_term_frequency=5,
        width_in_pixels=1000,
        metadata=df['title'] + ' by ' + df['author'],
        asian_mode=True)
    # The page contains Japanese text: write UTF-8 bytes explicitly instead of
    # relying on the platform default encoding, and close the file promptly.
    with open('./demo_japanese.html', 'wb') as out:
        out.write(html.encode('utf-8'))
    print('Open ./demo_japanese.html in Chrome or Firefox.')
def print_graph(corpus, speeches_df, category, type, not_type):
    """
    Render a Scattertext explorer for ``corpus`` and save it as an HTML file.

    :param corpus: corpus handed to ``st.produce_scattertext_explorer``
    :param speeches_df: dataframe whose ``category`` column is used as metadata
    :param category: name of the metadata column in ``speeches_df``
    :param type: category to plot. The shortcuts "1" and "2" select
        "Dictatorship" vs "Democracy" and "Democracy" vs "Dictatorship"
        respectively, overriding ``not_type``.
    :param not_type: display name of the opposing category
    :return: produces html file with corpus visualization
    """
    # NOTE: ``type`` shadows the builtin; it is kept because renaming the
    # parameter would break callers passing it by keyword.
    if type == "1":
        type, not_type = "Dictatorship", "Democracy"
    elif type == "2":
        type, not_type = "Democracy", "Dictatorship"

    html = st.produce_scattertext_explorer(
        corpus,
        category=type,
        category_name=type,
        not_category_name=not_type,
        width_in_pixels=1000,
        metadata=speeches_df[category])
    with open("./visualization/visualization_" + type + ".html", 'wb') as out:
        out.write(html.encode('utf-8'))
def create_scattertext_plot(df, category_col: str, text_col: str, nlp,
                            filename: str, label_match: str, label_name: str,
                            label_other_name: str, metadata_col: str, **kwargs):
    """Create an HTML file with an interactive scattertext plot.

    Will delete an 'index' column from ``df`` (in place) if there is one, as
    the scattertext function needs to create it.

    :param df: dataframe holding the documents
    :param category_col: column holding the two category labels
    :param text_col: column holding the raw text
    :param nlp: parser passed to ``st.CorpusFromPandas``
    :param filename: path of the HTML file to write
    :param label_match: must be one of the 2 entries in ``category_col``
    :param label_name: user-friendly name given to a match, e.g. if
        ``label_match`` is 'Yes', you might want a more meaningful label such
        as 'A good week'
    :param label_other_name: label for the other entry, e.g. 'A bad week'
    :param metadata_col: column of ``corpus.get_df()`` used as metadata
    :param kwargs: forwarded to ``scattertext.produce_scattertext_explorer``,
        e.g. ``minimum_term_frequency=8``
    :returns: nothing, but creates a HTML file
    """
    if 'index' in df.columns:
        df.drop('index', axis=1, inplace=True)
    corpus = st.CorpusFromPandas(
        df, category_col=category_col, text_col=text_col, nlp=nlp).build()
    html = st.produce_scattertext_explorer(
        corpus,
        category=label_match,
        category_name=label_name,
        not_category_name=label_other_name,
        metadata=corpus.get_df()[metadata_col],
        save_svg_button=True,
        **kwargs)
    # ``with`` guarantees the handle is closed even if the write fails.
    with open(filename, 'wb') as html_file:
        html_file.write(html.encode('utf-8'))
def scattertext_function(self):
    """Plot 'pos' vs. the other classification of the NY tweets.

    Reads After_Classification/After_Classification_NY_6.csv, parses the
    ``tweet`` column with spaCy, and writes After_Classification_NY_6.html.
    """
    nlp = spacy.load('en_core_web_sm')
    # CSV columns: 'Unnamed: 0', 'Date', 'name', 'tweet', 'death',
    # 'Classification'.
    convention_df = pd.read_csv(
        "After_Classification/After_Classification_NY_6.csv")
    convention_df['parsed'] = convention_df.tweet.apply(nlp)

    # Convert the dataframe into a Scattertext corpus.
    corpus = st.CorpusFromParsedDocuments(
        convention_df,
        category_col='Classification',
        parsed_col='parsed').build()
    print(type(st.Scalers.log_scale_standardize))
    # The original also computed the top background-scaled F-score terms here
    # and threw the result away; that dead computation has been dropped.

    html = st.produce_scattertext_explorer(
        corpus,
        category='pos',
        category_name='POS',
        not_category_name='NEG',
        minimum_term_frequency=5,
        width_in_pixels=1000,
        transform=st.Scalers.log_scale_standardize)
    file_name_1 = 'After_Classification_NY_6.html'
    with open(file_name_1, 'wb') as out:
        out.write(html.encode('utf-8'))
    print(IFrame(src=file_name_1, width=1200, height=700))
def create_html_with_two_categories(self, category_one, category_two,
                                    name_of_file):
    """Write a Scattertext explorer comparing two categories to ``name_of_file``.

    :param category_one: category to plot; also used as its display name
    :param category_two: display name of the opposing category
    :param name_of_file: path of the HTML file to write; it is remembered on
        ``self.name_of_file`` once the file has been written
    """
    html = st.produce_scattertext_explorer(
        self.term_cat_freq,
        category=category_one,
        category_name=category_one,
        not_category_name=category_two)
    with open(name_of_file, 'wb') as out:
        out.write(html.encode('utf-8'))
    self.name_of_file = name_of_file
def main():
    """Plot Empath topics of negative vs. positive satisfaction comments.

    The comments come from
    ``utils_data.get_all_satisfaction_score_comment_in_all_conds`` and the
    resulting explorer is written to a fixed path under /mnt/1T-5e7.
    """
    feat_builder = FeatsFromOnlyEmpath()

    # Each record is [sentiment, feature, review text], e.g.
    # ['negative', 'Satisfaction', 'after a week----mouth ulccers,...'].
    comments = utils_data.get_all_satisfaction_score_comment_in_all_conds()

    columns = ['senti_on_Metfor_oral', 'feature', 'review']
    comments_df = pd.DataFrame(comments, index=None, columns=columns)

    corpus = CorpusFromParsedDocuments(
        comments_df,
        category_col='senti_on_Metfor_oral',
        parsed_col='review',
        feats_from_spacy_doc=feat_builder).build()

    html = produce_scattertext_explorer(
        corpus,
        category='negative',
        category_name='Negative',
        not_category_name='Positive',
        width_in_pixels=1000,
        metadata=comments_df['feature'],
        use_non_text_features=True,
        use_full_doc=True,
        topic_model_term_lists=feat_builder.get_top_model_term_lists())

    output_path = ('/mnt/1T-5e7/mycodehtml/Data_mining/Visualization/'
                   'Scattertext/Convention-Visualization-Empath.html')
    with open(output_path, 'wb') as out:
        out.write(html.encode('utf-8'))
    # Report the path actually written (the old message named a file in the
    # current directory that was never created).
    print('Open %s in Chrome or Firefox.' % output_path)
def vis():
    """Plot subjective vs. objective language from the Cornell movie-review data.

    Downloads rotten_imdb.tar.gz, treats review quotes as subjective and plot
    summaries as objective, and writes the Scattertext explorer to
    Convention-Visualization.html.
    """
    # ``urlopen`` moved between Python 2 and 3; import it locally so the
    # function works on either without touching file-level imports.
    try:
        from urllib.request import urlopen
    except ImportError:  # Python 2
        from urllib import urlopen

    SUBJECTIVITY_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz'
    response = urlopen(SUBJECTIVITY_URL)
    try:
        data = io.BytesIO(response.read())
    finally:
        response.close()

    with tarfile.open(fileobj=data, mode='r:gz') as tarball:
        quote = tarball.extractfile('quote.tok.gt9.5000').read()
        plot = tarball.extractfile('plot.tok.gt9.5000').read()

    # Construct subjective vs. objective pandas dataframe, treating review
    # quotes as subjective, and plot points as objective.
    df = pd.DataFrame(
        [{'text': text.strip(), 'label': 'subjective'}
         for text in quote.decode('utf-8', errors='ignore').split('\n')]
        + [{'text': text.strip(), 'label': 'objective'}
           for text in plot.decode('utf-8', errors='ignore').split('\n')])

    # Convert the dataframe to a corpus; the category column is "label" and
    # the text column is "text".
    nlp = spacy.load('en')
    corpus = ST.CorpusFromPandas(
        data_frame=df,
        category_col='label',
        text_col='text',
        # Note: use nlp=spacy.en.English() for text that's not pre-tokenized
        nlp=nlp).build()

    # ``category`` must be one of the values in the category column, not the
    # column name itself ('label' is not a category of this corpus).
    html = ST.produce_scattertext_explorer(
        corpus,
        category='subjective',
        category_name='subjective',
        not_category_name='objective',
        width_in_pixels=1000)
    with open("Convention-Visualization.html", 'wb') as out:
        out.write(html.encode('utf-8'))
def print_graph(corpus, speeches_df, category, type, not_type):
    """
    Render a Scattertext explorer for ``corpus`` and save it as an HTML file.

    :param corpus: corpus handed to ``st.produce_scattertext_explorer``
    :param speeches_df: dataframe whose ``category`` column is used as metadata
    :param category: name of the metadata column in ``speeches_df``
    :param type: category to plot; also used as its display name and in the
        output file name
    :param not_type: display name of the opposing category
    :return: produces html file with corpus visualization
    """
    # NOTE: ``type`` shadows the builtin; it is kept because renaming the
    # parameter would break callers passing it by keyword.
    html = st.produce_scattertext_explorer(
        corpus,
        category=type,
        category_name=type,
        not_category_name=not_type,
        width_in_pixels=1000,
        metadata=speeches_df[category])
    with open("./visualization/visualization_" + type + ".html", 'wb') as out:
        out.write(html.encode('utf-8'))
def get_sct_html(rest_name, city_name):
    """Write a good-vs-other review explorer for a restaurant to rest_reviews-Vis.html.

    :param rest_name: restaurant name, forwarded to ``get_rest_reviews``
    :param city_name: city name, forwarded to ``get_rest_reviews``
    :return: whatever ``file.write`` returns for the encoded HTML (the number
        of bytes written on Python 3)
    """
    rest_reviews = get_rest_reviews(rest_name, city_name)
    nlp = spacy.load('en_core_web_sm')
    corpus = sct.CorpusFromPandas(
        rest_reviews, category_col='class', text_col='text', nlp=nlp).build()
    html = sct.produce_scattertext_explorer(
        corpus,
        category='good',
        category_name='Positive',
        not_category_name='Negative',
        width_in_pixels=900,
        metadata=rest_reviews['class'])
    # Keep returning the result of ``write`` for existing callers, but make
    # sure the file is closed before returning.
    with open("rest_reviews-Vis.html", 'wb') as out:
        return out.write(html.encode('utf-8'))
def create_scatterplot(df, return_corpus=False):
    """Create an HTML file to visualize differences in corpora.

    :param df: dataframe with an 'author' category column and a 'text' column
    :param return_corpus: if true, return the built corpus and skip writing
        the HTML file
    :return: the corpus when ``return_corpus`` is true, otherwise None after
        writing Author-Visualization.html
    """
    # ``nlp`` is resolved from the enclosing module.
    corpus = st.CorpusFromPandas(
        df, category_col='author', text_col='text', nlp=nlp).build()
    if return_corpus:
        return corpus
    html = st.produce_scattertext_explorer(
        corpus,
        category='EAP',
        category_name='Edger Allen Poe',
        not_category_name='HPL/MWS',
        width_in_pixels=1000,
        metadata=df['author'])
    with open("Author-Visualization.html", 'wb') as out:
        out.write(html.encode('utf-8'))
def standard(self, dataframe):
    """Write the '1st Document' vs. '2nd Document' explorer to ``self.std_file``.

    :param dataframe: dataframe with 'Document Type' and 'Text' columns
    """
    corpus = st.CorpusFromPandas(
        dataframe,
        category_col='Document Type',
        text_col='Text',
        nlp=self.nlp).build()
    html = st.produce_scattertext_explorer(
        corpus,
        category='1st Document',
        category_name='1st Document',
        not_category_name='2nd Document',
        width_in_pixels=1000)
    logger.getLogger().info("Opening Standard Visual")
    # Close the file before checking for it so the content is fully flushed.
    with open(self.std_file, 'wb') as out:
        out.write(html.encode('utf-8'))
    if os.path.isfile(self.std_file):
        logger.getLogger().info("Graph file created")
def main():
    """Compare two Chinese novels and write ./demo_chinese.html."""
    df = pd.read_csv(
        'https://cdn.rawgit.com/JasonKessler/scattertext/e508bf32/scattertext/data/chinese.csv')
    df['text'] = df['text'].apply(chinese_nlp)
    corpus = CorpusFromParsedDocuments(
        df, category_col='novel', parsed_col='text').build()
    html = produce_scattertext_explorer(
        corpus,
        category='Tale of Two Cities',
        category_name='Tale of Two Cities',
        not_category_name='Ulysses',
        width_in_pixels=1000,
        metadata=df['novel'],
        asian_mode=True)
    # The page contains Chinese text: write UTF-8 bytes explicitly instead of
    # relying on the platform default encoding, and close the file promptly.
    with open('./demo_chinese.html', 'wb') as out:
        out.write(html.encode('utf-8'))
    print('Open ./demo_chinese.html in Chrome or Firefox.')
def main():
    """Compare two Chinese novels and write ./demo_chinese.html."""
    df = pd.read_csv(
        'https://cdn.rawgit.com/JasonKessler/scattertext/e508bf32/scattertext/data/chinese.csv')
    df['text'] = df['text'].apply(chinese_nlp)
    corpus = CorpusFromParsedDocuments(
        df, category_col='novel', parsed_col='text').build()
    # NOTE(review): ``chinese_mode`` appears to be the older spelling of
    # ``asian_mode`` -- confirm against the installed scattertext version.
    html = produce_scattertext_explorer(
        corpus,
        category='Tale of Two Cities',
        category_name='Tale of Two Cities',
        not_category_name='Ulysses',
        width_in_pixels=1000,
        metadata=df['novel'],
        chinese_mode=True)
    # The page contains Chinese text: write UTF-8 bytes explicitly instead of
    # relying on the platform default encoding, and close the file promptly.
    with open('./demo_chinese.html', 'wb') as out:
        out.write(html.encode('utf-8'))
    print('Open ./demo_chinese.html in Chrome or Firefox.')
def standard(self, dataframe):
    """Write the submission-vs-standard explorer to ``self.std_file`` and open it.

    :param dataframe: dataframe with 'Document Type', 'Text' and 'Document'
        columns; 'Document' is shown as metadata
    """
    corpus = st.CorpusFromPandas(
        dataframe,
        category_col='Document Type',
        text_col='Text',
        nlp=self.nlp).build()
    html = st.produce_scattertext_explorer(
        corpus,
        category='submission',
        category_name='Submission',
        not_category_name='Standard',
        width_in_pixels=1000,
        metadata=dataframe['Document'])
    logger.getLogger().info("Opening Standard Visual")
    # Close (and thereby flush) the file before handing it to the browser.
    with open(self.std_file, 'wb') as out:
        out.write(html.encode('utf-8'))
    webbrowser.open("file://" + self.std_file)
def main():
    """Plot Empath topics of the 2012 convention speeches by party."""
    convention_df = SampleCorpora.ConventionData2012.get_data()
    feat_builder = FeatsFromOnlyEmpath()
    corpus = CorpusFromParsedDocuments(
        convention_df,
        category_col='party',
        parsed_col='text',
        feats_from_spacy_doc=feat_builder).build()
    html = produce_scattertext_explorer(
        corpus,
        category='democrat',
        category_name='Democratic',
        not_category_name='Republican',
        width_in_pixels=1000,
        metadata=convention_df['speaker'],
        use_non_text_features=True,
        use_full_doc=True,
        topic_model_term_lists=feat_builder.get_top_model_term_lists())
    with open('./Convention-Visualization-Empath.html', 'wb') as out:
        out.write(html.encode('utf-8'))
    print('Open ./Convention-Visualization-Empath.html in Chrome or Firefox.')
def main():
    """Plot Empath features of the 2012 convention speeches by party."""
    convention_df = SampleCorpora.ConventionData2012.get_data()
    corpus = CorpusFromParsedDocuments(
        convention_df,
        category_col='party',
        parsed_col='text',
        feats_from_spacy_doc=FeatsFromOnlyEmpath()).build()
    html = produce_scattertext_explorer(
        corpus,
        category='democrat',
        category_name='Democratic',
        not_category_name='Republican',
        width_in_pixels=1000,
        metadata=convention_df['speaker'],
        use_non_text_features=True,
        use_full_doc=True)
    with open('./Convention-Visualization-Empath.html', 'wb') as out:
        out.write(html.encode('utf-8'))
    print('Open ./Convention-Visualization-Empath.html in Chrome or Firefox.')
def main():
    """Plot General Inquirer features of the 2012 convention speeches by party."""
    convention_df = SampleCorpora.ConventionData2012.get_data()
    corpus = CorpusFromPandas(
        convention_df,
        category_col='party',
        text_col='text',
        nlp=whitespace_nlp_with_sentences,
        feats_from_spacy_doc=FeatsFromGeneralInquirer()).build()
    html = produce_scattertext_explorer(
        corpus,
        category='democrat',
        category_name='Democratic',
        not_category_name='Republican',
        width_in_pixels=1000,
        metadata=convention_df['speaker'],
        use_non_text_features=True,
        use_full_doc=True)
    with open('./demo_general_inquirer.html', 'wb') as out:
        out.write(html.encode('utf-8'))
    print('Open ./demo_general_inquirer.html in Chrome or Firefox.')
def main():
    """Compare two Japanese Project Gutenberg texts and write ./demo_japanese.html."""
    shisei = _parse_geutenberg('http://www.gutenberg.org/files/31617/31617-0.txt')
    horadanshaku = _parse_geutenberg('http://www.gutenberg.org/files/34084/34084-0.txt')
    df = pd.DataFrame({
        'text': [shisei, horadanshaku],
        'title': ['Shisei', 'Horadanshaku tabimiyage'],
        'author': ['Akutagawa Ryunosuke', 'Kuni Sasaki'],
    })
    df['text'] = df['text'].apply(st.japanese_nlp)
    corpus = st.CorpusFromParsedDocuments(
        df, category_col='title', parsed_col='text').build()
    html = st.produce_scattertext_explorer(
        corpus,
        category='Shisei',
        category_name='Shisei',
        not_category_name='Horadanshaku tabimiyage',
        minimum_term_frequency=5,
        width_in_pixels=1000,
        metadata=df['title'] + ' by ' + df['author'],
        asian_mode=True)
    # The page contains Japanese text: write UTF-8 bytes explicitly instead of
    # relying on the platform default encoding, and close the file promptly.
    with open('./demo_japanese.html', 'wb') as out:
        out.write(html.encode('utf-8'))
    print('Open ./demo_japanese.html in Chrome or Firefox.')
def main():
    """Plot General Inquirer topics of the 2012 convention speeches by party.

    Unlike the plain feature plot, this passes the builder's term lists and
    definitions so the explorer can describe each category.
    """
    convention_df = SampleCorpora.ConventionData2012.get_data()
    feat_builder = FeatsFromGeneralInquirer()
    corpus = CorpusFromPandas(
        convention_df,
        category_col='party',
        text_col='text',
        nlp=whitespace_nlp_with_sentences,
        feats_from_spacy_doc=feat_builder).build()
    html = produce_scattertext_explorer(
        corpus,
        category='democrat',
        category_name='Democratic',
        not_category_name='Republican',
        width_in_pixels=1000,
        metadata=convention_df['speaker'],
        use_non_text_features=True,
        use_full_doc=True,
        topic_model_term_lists=feat_builder.get_top_model_term_lists(),
        metadata_descriptions=feat_builder.get_definitions())
    with open('./demo_general_inquirer.html', 'wb') as out:
        out.write(html.encode('utf-8'))
    print('Open ./demo_general_inquirer.html in Chrome or Firefox.')
def empath(self, dataframe):
    """Write the Empath submission-vs-standard explorer to ``self.empath_file`` and open it.

    :param dataframe: dataframe with 'Document Type', 'Text' and 'Document'
        columns; 'Document' is shown as metadata
    """
    feat_builder = st.FeatsFromOnlyEmpath()
    empath_corpus = st.CorpusFromParsedDocuments(
        dataframe,
        category_col='Document Type',
        feats_from_spacy_doc=feat_builder,
        parsed_col='Text').build()
    html = st.produce_scattertext_explorer(
        empath_corpus,
        category='submission',
        category_name='Submission',
        not_category_name='Standard',
        width_in_pixels=1000,
        metadata=dataframe['Document'],
        use_non_text_features=True,
        use_full_doc=True,
        topic_model_term_lists=feat_builder.get_top_model_term_lists())
    logger.getLogger().info("Opening Empath Visual")
    # Close (and thereby flush) the file before handing it to the browser.
    with open(self.empath_file, 'wb') as out:
        out.write(html.encode('utf-8'))
    webbrowser.open("file://" + self.empath_file)
def scatterplot(df):
    '''
    Write a Ballmer-vs-Nadella Scattertext explorer.

    input: a dataframe with text, CEO, and quarter
           (columns 'text', 'ceo' and 'quarter')
    output: a scatterplot, saved to ../Charts/scattertext_demo.html
    '''
    corpus = st.CorpusFromPandas(
        df,
        category_col='ceo',
        text_col='text',
        nlp=st.whitespace_nlp_with_sentences).build()
    html = st.produce_scattertext_explorer(
        corpus,
        category='Ballmer',
        category_name='Steve Ballmer Era',
        not_category_name='Satya Nadella Era',
        minimum_term_frequency=10,
        pmi_threshold_coefficient=5,
        width_in_pixels=1000,
        metadata=df['quarter'],
    )
    with open('../Charts/scattertext_demo.html', 'wb') as out:
        out.write(html.encode('utf-8'))
def scatter_viz(self):
    """Render the 'left' vs. other explorer into the templates folder.

    :return: path of the written HTML file
        (``key_path + '/templates/scattertext.html'``)
    """
    # load corpus
    corpus = self.create_corpus()
    # load cleaned df
    convention_df = self.clean_texts()
    html = produce_scattertext_explorer(
        corpus,
        category='left',
        category_name='Democratic',
        not_category_name='Republican',
        width_in_pixels=1000,
        minimum_term_frequency=1,
        metadata=convention_df['position'],
        term_significance=st.LogOddsRatioUninformativeDirichletPrior())
    file_name = key_path + '/templates/scattertext.html'
    # Close the file before returning its path so readers see complete content.
    with open(file_name, 'wb') as out:
        out.write(html.encode('utf-8'))
    return file_name
def generate_visual(data, category, category_name, not_category_name, filename='index.html'):
    """Return the Scattertext explorer HTML for journal abstracts.

    :param data: dataframe with 'label' (category), 'abstract' (text) and
        'journal' (metadata) columns
    :param category: value of the 'label' column to plot
    :param category_name: display name for ``category``
    :param not_category_name: display name for the remaining documents
    :param filename: accepted for compatibility; not used by this function
    :return: the explorer HTML
    """
    import spacy
    import scattertext as st

    corpus_factory = st.CorpusFromPandas(
        data,
        category_col='label',
        text_col='abstract',
        nlp=spacy.load('en_core_web_sm'),
    )
    built_corpus = corpus_factory.build()
    explorer_options = dict(
        category=category,
        category_name=category_name,
        not_category_name=not_category_name,
        width_in_pixels=1000,
        metadata=data['journal'],
    )
    return st.produce_scattertext_explorer(built_corpus, **explorer_options)
from scattertext import SampleCorpora, whitespace_nlp_with_sentences
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termscoring.ScaledFScore import ScaledFScorePresets

# Demo: Democratic vs. Republican language in the 2012 convention speeches,
# tokenized on whitespace (no spaCy model needed).
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences).build()

html = produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=5,
    pmi_threshold_coefficient=8,
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
)
# Use a context manager so the output file is closed deterministically.
with open('./demo.html', 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./demo.html in Chrome or Firefox.')
import spacy
from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termscoring.LogOddsUniformativePriorScore import LogOddsUninformativePriorScore

# Demo: grey out terms whose thresholded log-odds score is zero.
nlp = spacy.load('en_core_web_sm')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=nlp).build()

term_freq_df = corpus.get_term_freq_df()
# The thresholded score is negated before being handed to the explorer.
scores = -(LogOddsUninformativePriorScore.get_thresholded_score(
    term_freq_df['democrat freq'],
    term_freq_df['republican freq'],
    alpha_w=2.,
    threshold=0.1))

html = produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    scores=scores,
    sort_by_dist=False,
    gray_zero_scores=True,
    minimum_term_frequency=5,
    width_in_pixels=1000,
    metadata=convention_df['speaker'])
# Use a context manager so the output file is closed deterministically.
with open('./demo_insignificant_greyed_out.html', 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./demo_insignificant_greyed_out.html in Chrome or Firefox.')
# Compare the two protein-powder review files in a Scattertext explorer.
original_data = df.reset_index(drop=True)
df = pd.concat([original_data, cleaned_texts], axis=1)
df['parsed_text'] = df['parsed_text'].apply(chinese_nlp)

# Strip ``pattern`` from the raw text, then tokenize it. Done column-wise
# rather than via ``df['text'][i] = ...``, which is chained assignment and may
# silently write to a temporary copy.
df['text'] = df['text'].apply(lambda raw: re.sub(pattern, '', raw))
df['text'] = df['text'].apply(chinese_nlp)

corpus = CorpusFromParsedDocuments(
    df, category_col='file_name', parsed_col='parsed_text').build()
html = produce_scattertext_explorer(
    corpus,
    category='安利蛋白粉评论.txt',
    category_name='安利蛋白粉评论.txt',
    not_category_name='汤臣倍健蛋白粉评论.txt',
    width_in_pixels=1000,
    metadata=df['file_name'],
    asian_mode=True,
    alternative_text_field="text")

output_path = 'C:/Users/CNU074VP/Desktop/Chinese Topic Model/protein_review_compare.html'
with open(output_path, 'w', encoding='utf-8') as out:
    out.write(html)
# Point the user at the file itself rather than only its folder.
print('Open %s in Chrome or Firefox.' % output_path)
# Scattertext attack vs support (only responses) # https://kanoki.org/2019/03/17/text-data-visualization-in-python/ # https://github.com/JasonKessler/scattertext nlp = spacy.load('en_core_web_sm') for data_set in ['debate_test', 'debate_train', 'procon', 'political']: df_plot = df.loc[(df['org_dataset'] == data_set) & (df['label'].isin(['attack', 'support']))] df_plot['parsed'] = df_plot['response'].apply(nlp) corpus = st.CorpusFromParsedDocuments(df_plot, category_col='label', parsed_col='parsed').build() html = st.produce_scattertext_explorer( corpus, category='attack', not_category_name='support', width_in_pixels=1000, minimum_term_frequency=5, transform=st.Scalers.log_scale_standardize, use_full_doc=True) file_name = './plots/scattertext_attack_support' + data_set + '.html' with open(file_name, 'wb') as file: file.write(html.encode('utf-8')) # Scattertext Nixon vs Kennedy df_plot = data_nix_ken df_plot['parsed'] = df_plot['text'].apply(nlp) corpus = st.CorpusFromParsedDocuments(df_plot, category_col='author', parsed_col='parsed').build() html = st.produce_scattertext_explorer( corpus,
from scattertext import SampleCorpora
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.WhitespaceNLP import whitespace_nlp

# Demo: build the convention explorer with the whitespace tokenizer instead
# of a spaCy model.
nlp = whitespace_nlp
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=nlp).build()

html = produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=5,
    width_in_pixels=1000,
    metadata=convention_df['speaker'])
# Use a context manager so the output file is closed deterministically.
with open('./demo_without_spacy.html', 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./demo_without_spacy.html in Chrome or Firefox.')
[ 'obama', 'romney', 'democrats', 'republicans', 'health', 'military', 'taxes', 'education', 'olympics', 'auto', 'iraq', 'iran', 'israel' ], scorer=RankDifference(), num_terms_per_topic=20)) topic_feature_builder = st.FeatsFromTopicModel(topic_model) topic_corpus = st.CorpusFromParsedDocuments( convention_df, category_col='party', parsed_col='parse', feats_from_spacy_doc=topic_feature_builder).build() html = st.produce_scattertext_explorer( topic_corpus, category='democrat', category_name='Democratic', not_category_name='Republican', width_in_pixels=1000, metadata=convention_df['speaker'], use_non_text_features=True, use_full_doc=True, pmi_threshold_coefficient=0, topic_model_term_lists=topic_feature_builder.get_top_model_term_lists()) open('./demo_word_list_topic_model.html', 'wb').write(html.encode('utf-8')) print('Open ./demo_word_list_topic_model.html in Chrome or Firefox.')
# Summary statistics of the corpus.
print("Número de documentos: " + str(corpus.get_num_docs()))
print("Tamanho de documentos: " + str(corpus.get_doc_lengths()))
print("Número de termos: " + str(corpus.get_num_terms()))
print("Palavras que diferem dos corpus comuns: ")
x = corpus.get_scaled_f_scores_vs_background()
print(list(x.index[0:10]))

# Scaled F-score of each term for the two classes.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['positivo'] = corpus.get_scaled_f_scores('positivo')
term_freq_df['negativo'] = corpus.get_scaled_f_scores('negativo')

# Sort by each class score to obtain the highest-ranked terms per class.
termosPositivos = term_freq_df.sort_values(by='positivo', ascending=False)
termosNegativos = term_freq_df.sort_values(by='negativo', ascending=False)
print("Palavras mais frequentes entre as classes positivas: ")
print(list(termosPositivos.index[0:10]))
print("Palavras mais frequentes entre as classes negativas: ")
print(list(termosNegativos.index[0:10]))

# Generate the interactive chart. It is written to an HTML file that must be
# opened in a browser and may take a while to load.
html = scattertext.produce_scattertext_explorer(
    corpus,
    category='positivo',  # class placed on the y axis
    category_name='Positivo',  # display name of that class in the chart
    not_category_name='Negativo',  # display name of the x-axis class
    width_in_pixels=1000)  # chart width
# Use a context manager so the output file is closed deterministically.
with open("graficos.html", 'wb') as out:
    out.write(html.encode('utf-8'))
# Label each row by gender probability: confident male (> 0.9), confident
# female (< 0.1), otherwise unknown ('?').
df_aug['gender'] = df_aug['prob'].apply(
    lambda x: 'm' if x > 0.9 else 'f' if x < 0.1 else '?')
# Take an explicit copy so adding the 'parse' column below does not write to a
# slice of ``df_aug`` (pandas SettingWithCopy).
df_mf = df_aug[df_aug['gender'].isin(['m', 'f'])].copy()
df_mf.to_csv('emoji_data.csv', index=False)

nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())
df_mf['parse'] = df_mf['Tweet content'].apply(nlp)

# Only emoji are used as features.
corpus = st.CorpusFromParsedDocuments(
    df_mf,
    parsed_col='parse',
    category_col='gender',
    feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
).build()

html = st.produce_scattertext_explorer(
    corpus,
    category='f',
    category_name='Female',
    not_category_name='Male',
    use_full_doc=True,
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=(df_mf['User Name']
              + ' (@' + df_mf['Nickname'] + ') '
              + df_mf['Date'].astype(str)),
    width_in_pixels=1000
)
print('writing EmojiGender.html')
# Use a context manager so the output file is closed deterministically.
with open("EmojiGender.html", 'wb') as out:
    out.write(html.encode('utf-8'))
# Hand-written topic model: topic name -> terms that count towards it.
topic_model = {
    'money': ['money', 'bank', 'banks', 'finances', 'financial', 'loan',
              'dollars', 'income'],
    'jobs': ['jobs', 'workers', 'labor', 'employment', 'worker', 'employee',
             'job'],
    'patriotic': ['america', 'country', 'flag', 'americans', 'patriotism',
                  'patriotic'],
    'family': ['mother', 'father', 'mom', 'dad', 'sister', 'brother',
               'grandfather', 'grandmother', 'son', 'daughter'],
}

topic_feature_builder = st.FeatsFromTopicModel(topic_model)
topic_corpus = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=topic_feature_builder
).build()

html = st.produce_scattertext_explorer(
    topic_corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    use_non_text_features=True,
    use_full_doc=True,
    pmi_threshold_coefficient=0,
    topic_model_term_lists=topic_feature_builder.get_top_model_term_lists()
)
# Use a context manager so the output file is closed deterministically.
with open('./demo_custom_topic_model.html', 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./demo_custom_topic_model.html in Chrome or Firefox.')
# Join the two dataframes along the column axis
convention_df = pd.concat([df1, df2], axis=1)

# Place all text in the same column ('value') and tag each row with its
# source column ('variable'), then drop rows with missing values
convention_df = pd.melt(convention_df)
convention_df = convention_df.dropna(axis=0, how='any')

# Build NLP parsing for corpus
English = st.whitespace_nlp_with_sentences

# Parse the text and create new column with parsed values
convention_df['parsed'] = convention_df.value.apply(English)

# Generate corpus of language from pandas dataframe
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='variable',
    text_col='value',
    nlp=English).build()

# Output html doc for visualization.
# The 'output' folder must already exist; the file itself is created (or
# overwritten) by open().
html = st.produce_scattertext_explorer(
    corpus,
    category='CNN',
    category_name='CNN',
    not_category_name='Fox',
    width_in_pixels=1000)
file_name = 'output/Trump.html'
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
# Kept as the final expression -- presumably so a notebook cell displays the
# frame; it has no effect when run as a plain script.
IFrame(src=file_name, width=1200, height=700)
from scattertext import SampleCorpora, whitespace_nlp_with_sentences
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termscoring.ScaledFScore import ScaledFScorePresets

# Demo: convention explorer that loads the d3 scripts from the given relative
# paths instead of the default URLs.
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences).build()

html = produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=5,
    pmi_threshold_coefficient=8,
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    d3_url='scattertext/data/viz/scripts/d3.min.js',
)
# Use a context manager so the output file is closed deterministically.
with open('./demo.html', 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./demo.html in Chrome or Firefox.')
def scale(ar):
    """Min-max scale ``ar`` so its smallest value maps to 0 and its largest to 1."""
    return (ar - ar.min()) / (ar.max() - ar.min())


def zero_centered_scale(ar):
    """Scale positive and negative values separately and map the result to [0, 1].

    Positive entries are min-max scaled into [0, 1], negative entries into
    [-1, 0]; zeros stay at 0. The result is then shifted so that 0 lands on
    0.5. The input is left untouched.
    """
    # Work on a copy: the caller still needs the raw values afterwards.
    scaled = ar.copy()
    positive = scaled > 0
    negative = scaled < 0
    # Skip a sign that does not occur; min()/max() fail on an empty selection.
    if positive.any():
        scaled[positive] = scale(scaled[positive])
    if negative.any():
        scaled[negative] = -scale(-scaled[negative])
    return (scaled + 1) / 2.


# x axis: log term frequency; y axis: logistic-regression coefficient.
frequencies_scaled = scale(np.log(term_freq_df.sum(axis=1).values))
scores = corpus.get_logreg_coefs(
    'democrat',
    LogisticRegression(penalty='l2', C=10, max_iter=10000, n_jobs=-1))
# ``scores`` keeps the raw coefficients; only ``scores_scaled`` is rescaled.
scores_scaled = zero_centered_scale(scores)

html = produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=5,
    width_in_pixels=1000,
    x_coords=frequencies_scaled,
    y_coords=scores_scaled,
    scores=scores,
    sort_by_dist=False,
    metadata=convention_df['speaker'],
    x_label='Log frequency',
    y_label='L2-penalized logistic regression coef')
fn = 'demo_custom_coordinates.html'
# Use a context manager so the output file is closed deterministically.
with open(fn, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % fn)
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, RankDifference
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

# Demo: convention explorer scored with RankDifference, loading the d3 scripts
# from the given relative paths.
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences).build()

html = produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=5,
    pmi_threshold_coefficient=8,
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    term_scorer=RankDifference(),
    d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    d3_url='scattertext/data/viz/scripts/d3.min.js',
)
# Use a context manager so the output file is closed deterministically.
with open('./demo.html', 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./demo.html in Chrome or Firefox.')
def _build_arg_parser():
    """Return the argparse parser describing the command line options."""
    parser = argparse.ArgumentParser(
        description="A primitive, incomplete commandline interface to Scattertext.")
    parser.add_argument('--datafile', action='store', dest='datafile', required=True,
                        help="Path (or URL) of a CSV file with at least two columns. "
                             "Text and category column names are indicated by the --text_column "
                             "and --category_column arguments. By default, they are 'text', and "
                             "'category'. Optionally, a metadata column (named in the "
                             "--metadata_column argument) can be present.")
    parser.add_argument('--outputfile', action='store', dest='outputfile', default="-",
                        help="Path of HTML file on which to store visualization. "
                             "Pass in - (default) for stdout.")
    parser.add_argument('--text_column', action='store', dest='text_column', default="text",
                        help="Name of the text column.")
    parser.add_argument('--category_column', action='store', dest='category_column',
                        default="category",
                        help="Name of the category column.")
    parser.add_argument('--metadata_column', action='store', dest='metadata_column',
                        default=None,
                        help="Name of the metadata column.")
    parser.add_argument('--positive_category', action='store', required=True,
                        dest='positive_category',
                        help="Positive category. A value in category_column to be considered "
                             "the positive class. All others will be considered negative.")
    parser.add_argument('--category_display_name', action='store',
                        dest='category_display_name', default=None,
                        help="Positive category name which will be used on the visualization. "
                             "By default, it will just be the positive category value.")
    parser.add_argument('--not_category_display_name', action='store', default=None,
                        dest='not_category_display_name',
                        help="Negative category name which will be used on the visualization. "
                             "By default, it will just be the word 'not' in front of the "
                             "positive value.")
    # No argparse default is set here; an omitted option is forwarded as None.
    parser.add_argument('--pmi_threshold', action='store', dest='pmi_threshold', type=int,
                        help="2 * minimum allowable PMI value. Default 6.")
    parser.add_argument('--width_in_pixels', action='store', dest='width_in_pixels',
                        type=int, default=1000,
                        help="Width of the visualization in pixels.")
    parser.add_argument('--minimum_term_frequency', action='store',
                        dest='minimum_term_frequency', type=int, default=3,
                        help="Minimum number of times a term needs to appear. Default 3")
    parser.add_argument('--regex_parser', action='store_true', dest='regex_parser',
                        default=False,
                        help="If present, don't use spaCy for preprocessing. Instead, "
                             "use a simple, dumb, regex.")
    parser.add_argument('--spacy_language_model', action='store',
                        dest='spacy_language_model', default='en',
                        help="If present, pick the spaCy language model to use. Default is 'en'. "
                             "Other valid values include 'de' and 'fr'. --regex_parser will "
                             "override. Please see https://spacy.io/docs/api/language-models "
                             "for more details")
    parser.add_argument('--one_use_per_doc', action='store_true', dest='one_use_per_doc',
                        default=False,
                        help="Only count one use per document.")
    return parser


def _require_column(df, option_name, column):
    """Raise ``Exception`` unless ``column`` is one of ``df``'s column names."""
    if column not in df.columns:
        raise Exception("%s (%s) must be a column name in csv. Must be one of %s"
                        % (option_name, column, ', '.join(df.columns)))


def _validate_args(args, df):
    """Check that the columns and category named on the command line exist in ``df``."""
    _require_column(df, 'category_column', args.category_column)
    _require_column(df, 'text_column', args.text_column)
    # The metadata column is optional, so it is only checked when supplied.
    if args.metadata_column is not None:
        _require_column(df, 'metadata_column', args.metadata_column)
    if args.positive_category not in df[args.category_column].unique():
        raise Exception('positive_category (%s) must be in the column "%s", '
                        'with a case-sensitive match.'
                        % (args.positive_category, args.category_column))


def main():
    """Command line entry point: read a CSV and emit a Scattertext explorer.

    Parses ``sys.argv``, loads ``--datafile`` with pandas, validates the
    requested columns, builds a corpus and writes the resulting HTML either
    to stdout (``--outputfile -``, the default) or to the given path as
    UTF-8.

    :raises Exception: if a named column is missing from the CSV or the
        positive category does not occur in the category column.
    """
    args = _build_arg_parser().parse_args()
    df = pd.read_csv(args.datafile)
    _validate_args(args, df)

    if args.regex_parser:
        nlp = whitespace_nlp_with_sentences
    else:
        # Imported lazily so that --regex_parser works without spaCy installed.
        import spacy
        nlp = spacy.load(args.spacy_language_model)

    term_ranker = OncePerDocFrequencyRanker if args.one_use_per_doc else None

    category_display_name = args.category_display_name
    if category_display_name is None:
        category_display_name = args.positive_category
    not_category_display_name = args.not_category_display_name
    if not_category_display_name is None:
        not_category_display_name = 'Not ' + category_display_name

    metadata = None if args.metadata_column is None else df[args.metadata_column]

    corpus = CorpusFromPandas(df,
                              category_col=args.category_column,
                              text_col=args.text_column,
                              nlp=nlp).build()
    html = produce_scattertext_explorer(corpus,
                                        category=args.positive_category,
                                        category_name=category_display_name,
                                        not_category_name=not_category_display_name,
                                        minimum_term_frequency=args.minimum_term_frequency,
                                        # Keyword spelling kept exactly as the original call.
                                        pmi_filter_thresold=args.pmi_threshold,
                                        width_in_pixels=args.width_in_pixels,
                                        term_ranker=term_ranker,
                                        metadata=metadata)
    if args.outputfile == '-':
        print(html)
    else:
        with open(args.outputfile, 'wb') as o:
            o.write(html.encode('utf-8'))
# Demo: unigram explorer of the ConventionData2012 sample corpus, compacted
# with AssociationCompactor(2000), plotted with dense-rank scaling, no
# diagonal and a vertical line at 0.5.  Saved to ./demo_vertical_lines.html.
import scattertext as st

df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda frame: frame.text.apply(st.whitespace_nlp_with_sentences))

corpus = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))

html = st.produce_scattertext_explorer(corpus,
                                       category='democrat',
                                       category_name='Democratic',
                                       not_category_name='Republican',
                                       minimum_term_frequency=0,
                                       pmi_threshold_coefficient=0,
                                       width_in_pixels=1000,
                                       metadata=corpus.get_df()['speaker'],
                                       transform=st.Scalers.dense_rank,
                                       show_diagonal=False,
                                       max_overlapping=3,
                                       vertical_lines=0.5)
# Write explicit UTF-8 bytes (rather than text mode with the platform default
# encoding) inside a context manager so the handle is always closed.
with open('./demo_vertical_lines.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('open ./demo_vertical_lines.html in Chrome')
# Demo: unigram explorer of the ConventionData2012 sample corpus categorised
# by speaker, contrasting 'BARACK OBAMA' with all other speakers using
# RankDifference scores and dense-rank scaling.
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
corpus = (st.CorpusFromPandas(convention_df,
                              category_col='speaker',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus())

html = st.produce_scattertext_explorer(
    corpus,
    category='BARACK OBAMA',
    sort_by_dist=False,
    # Each document's metadata string combines its party and speaker.
    metadata=convention_df['party'] + ': ' + convention_df['speaker'],
    term_scorer=st.RankDifference(),
    transform=st.Scalers.dense_rank
)
file_name = 'demo_dense_rank_difference.html'
# Context manager guarantees the output file is flushed and closed.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))