def empath(self, dataframe):
    """Build an Empath-topic scattertext explorer for *dataframe* and open it in a browser.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'Document Type' (category), 'Text' (parsed docs) and
        'Document' (metadata labels) columns.

    Side effects: writes the explorer HTML to ``self.empath_file`` and opens it
    with the default web browser.
    """
    feat_builder = st.FeatsFromOnlyEmpath()
    empath_corpus = st.CorpusFromParsedDocuments(
        dataframe,
        category_col='Document Type',
        feats_from_spacy_doc=feat_builder,
        parsed_col='Text').build()
    html = st.produce_scattertext_explorer(
        empath_corpus,
        category='submission',
        category_name='Submission',
        not_category_name='Standard',
        width_in_pixels=1000,
        metadata=dataframe['Document'],
        use_non_text_features=True,
        use_full_doc=True,
        topic_model_term_lists=feat_builder.get_top_model_term_lists())
    logger.getLogger().info("Opening Empath Visual")
    # Context manager guarantees the file is flushed and closed *before* the
    # browser tries to read it (the original bare open().write() leaked the handle).
    with open(self.empath_file, 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
    webbrowser.open("file://" + self.empath_file)
# Find the two most prolific authors in `df` and build two scattertext
# explorers comparing them: one over raw terms, one over Empath topics.
# Assumes `df`, `st` (scattertext) and `nlp` are already in scope.
df_1 = df.groupby(["author"]).size().reset_index(name='Counts')
df_1 = df_1.sort_values(by=['Counts'], ascending=False)
df_1 = df_1.head(2)  # keep only the top-2 authors by article count
# Inner-join back to the full frame to recover those authors' articles.
df_2 = df_1.merge(df, on='author', how='inner')
df_2 = df_2.sort_values(by=['Counts'], ascending=False)

build_corpus = st.CorpusFromPandas(df_2,
                                   category_col='author',
                                   text_col='clean_article_text',
                                   nlp=nlp).build()
df_freq = build_corpus.get_term_freq_df()
# Scaled-F scores highlight terms characteristic of each author.
df_freq['GLOBE EDITORIAL SCORE'] = build_corpus.get_scaled_f_scores('GLOBE EDITORIAL')
df_freq['Jeffrey Simpson Score'] = build_corpus.get_scaled_f_scores('Jeffrey Simpson')

html = st.produce_scattertext_explorer(build_corpus,
                                       category='GLOBE EDITORIAL',
                                       category_name='GLOBE EDITORIAL',
                                       not_category_name='Jeffrey Simpson',
                                       width_in_pixels=1000,
                                       metadata=df_2['author'])
# Context manager closes the handle (the bare open().write() leaked it).
with open("../output/Top_2_Authors.html", 'wb') as out_file:
    out_file.write(html.encode('utf-8'))

# Visualizing Empath topics and categories instead of terms.
build_feats = st.FeatsFromOnlyEmpath()
# NOTE(review): CorpusFromParsedDocuments expects pre-parsed docs in
# parsed_col; 'clean_article_text' looks like raw text — confirm upstream parsing.
build_corpus_2 = st.CorpusFromParsedDocuments(
    df_2,
    category_col='author',
    feats_from_spacy_doc=build_feats,
    parsed_col='clean_article_text').build()
html = st.produce_scattertext_explorer(
    build_corpus_2,
    category='GLOBE EDITORIAL',
    category_name='GLOBE EDITORIAL',
    not_category_name='Jeffrey Simpson',
    width_in_pixels=1000,
    metadata=df_2['author'],
    use_non_text_features=True,
    use_full_doc=True,
    topic_model_term_lists=build_feats.get_top_model_term_lists())
with open("../output/Top_2_Authors-Empath.html", 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
import scattertext as st

# Build an Empath-feature pairplot over the 2012 convention sample corpus and
# write it to an HTML file, printing the relative path when done.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
empath_feature_builder = st.FeatsFromOnlyEmpath()
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=empath_feature_builder).build().get_unigram_corpus()
html = st.produce_pairplot(
    corpus,
    use_metadata=True,
    # compactor=None keeps all categories instead of compacting them.
    category_projector=st.CategoryProjector(compactor=None),
    topic_model_term_lists=empath_feature_builder.get_top_model_term_lists(),
    # Label each point as "<party>: <speaker>".
    metadata=convention_df['party'] + ': ' + convention_df['speaker'])
file_name = 'convention_pair_plot_empath.html'
# Context manager ensures the file is flushed/closed (bare open().write() leaked it).
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

plt.style.use("seaborn")
warnings.filterwarnings(action='ignore')

# Load Airbnb listings and bucket the 0-100 review score into five star bands.
listing = pd.read_csv('data/listings.csv')
listing['review_scores_ratings'] = listing['review_scores_rating'] / 20
listing['review_scores_ratings'] = pd.cut(
    listing['review_scores_ratings'],
    bins=5,
    labels=["0.0-1.0", "1.0-2.0", "2.0-3.0", "3.0-4.0", "4.0-5.0"])
listing['description'].replace(np.NaN, "no_description", inplace=True)
listing["review_scores_ratings"].replace(np.NaN, "no_review", inplace=True)

nlp = spacy.load("en")
# BUG FIX: the binned star labels ("0.0-1.0" ... "4.0-5.0") live in
# 'review_scores_ratings' (trailing 's'); the original passed the raw 0-100
# 'review_scores_rating' column, so category="4.0-5.0" could never match.
corpus = st.CorpusFromPandas(listing,
                             category_col="review_scores_ratings",
                             text_col="description",
                             nlp=nlp).build().remove_terms(
                                 ENGLISH_STOP_WORDS, ignore_absences=True)
html = st.produce_scattertext_explorer(corpus,
                                       category="4.0-5.0",
                                       category_name="Star Rating 4.0 - 5.0",
                                       not_category_name="All Other Ratings",
                                       width_in_pixels=1000,
                                       minimum_term_frequency=200)
# Context manager closes the handle (bare open().write() leaked it).
with open("term-associations.html", "wb") as out_file:
    out_file.write(html.encode("utf-8"))

feat_builder = st.FeatsFromOnlyEmpath()