Esempio n. 1
0
    def empath(self, dataframe):
        """Render an Empath-topic Scattertext explorer and open it in a browser.

        A corpus is built from ``dataframe`` with ``'Document Type'`` as the
        category column and ``'Text'`` as the parsed-document column, using
        ``st.FeatsFromOnlyEmpath`` as the feature extractor.  The explorer
        plots the ``'submission'`` category (labelled "Submission") against
        the remaining documents (labelled "Standard").  The resulting HTML is
        written as UTF-8 to ``self.empath_file`` and then opened through
        ``webbrowser``.

        Args:
            dataframe: DataFrame supplying the ``'Document Type'``, ``'Text'``
                and ``'Document'`` columns; ``'Document'`` is passed as the
                per-document metadata.
        """
        feat_builder = st.FeatsFromOnlyEmpath()
        empath_corpus = st.CorpusFromParsedDocuments(
            dataframe,
            category_col='Document Type',
            feats_from_spacy_doc=feat_builder,
            parsed_col='Text').build()

        html = st.produce_scattertext_explorer(
            empath_corpus,
            category='submission',
            category_name='Submission',
            not_category_name='Standard',
            width_in_pixels=1000,
            metadata=dataframe['Document'],
            use_non_text_features=True,
            use_full_doc=True,
            topic_model_term_lists=feat_builder.get_top_model_term_lists())

        logger.getLogger().info("Opening Empath Visual")
        # Close (and therefore flush) the file before handing its path to the
        # browser; relying on garbage collection to do so is not guaranteed.
        with open(self.empath_file, 'wb') as html_file:
            html_file.write(html.encode('utf-8'))
        webbrowser.open("file://" + self.empath_file)
Esempio n. 2
0
# Count articles per author and keep the two authors with the most articles.
df_1 = (
    df.groupby(["author"])
    .size()
    .reset_index(name='Counts')
    .sort_values(by=['Counts'], ascending=False)
    .head(2)
)

# Inner-join back onto the full frame so only those two authors' rows remain.
df_2 = df_1.merge(df, on='author', how='inner')
df_2 = df_2.sort_values(by=['Counts'], ascending=False)

build_corpus = st.CorpusFromPandas(
    df_2,
    category_col='author',
    text_col='clean_article_text',
    nlp=nlp).build()

# Term frequencies plus a scaled F-score column for each of the two authors.
# NOTE(review): the author names are hard-coded; this assumes they are the two
# rows selected into df_1 above -- confirm against the data.
df_freq = build_corpus.get_term_freq_df()
df_freq['GLOBE EDITORIAL SCORE'] = build_corpus.get_scaled_f_scores('GLOBE EDITORIAL')
df_freq['Jeffrey Simpson Score'] = build_corpus.get_scaled_f_scores('Jeffrey Simpson')

html = st.produce_scattertext_explorer(
    build_corpus,
    category='GLOBE EDITORIAL',
    category_name='GLOBE EDITORIAL',
    not_category_name='Jeffrey Simpson',
    width_in_pixels=1000,
    metadata=df_2['author'])

# Use a context manager so the output file is closed deterministically.
with open("../output/Top_2_Authors.html", 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

# Visualize Empath topics and categories instead of individual terms.
build_feats = st.FeatsFromOnlyEmpath()
build_corpus_2 = st.CorpusFromParsedDocuments(
    df_2,
    category_col='author',
    feats_from_spacy_doc=build_feats,
    parsed_col='clean_article_text').build()
html = st.produce_scattertext_explorer(
    build_corpus_2,
    category='GLOBE EDITORIAL',
    category_name='GLOBE EDITORIAL',
    not_category_name='Jeffrey Simpson',
    width_in_pixels=1000,
    metadata=df_2['author'],
    use_non_text_features=True,
    use_full_doc=True,
    topic_model_term_lists=build_feats.get_top_model_term_lists())

with open("../output/Top_2_Authors-Empath.html", 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
import scattertext as st

# Build a corpus of the 2012 convention speeches, one category per speaker,
# with Empath topics supplied by the feature builder.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
empath_feature_builder = st.FeatsFromOnlyEmpath()

corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=empath_feature_builder).build().get_unigram_corpus()

# Pair plot over the metadata (Empath) features; each document is labelled
# "<party>: <speaker>".
html = st.produce_pairplot(
    corpus,
    use_metadata=True,
    category_projector=st.CategoryProjector(compactor=None),
    topic_model_term_lists=empath_feature_builder.get_top_model_term_lists(),
    metadata=convention_df['party'] + ': ' + convention_df['speaker'])

file_name = 'convention_pair_plot_empath.html'
# Use a context manager so the file is closed before its path is reported.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
print('./' + file_name)
Esempio n. 4
0
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# NOTE(review): the "seaborn" style name and the "en" spaCy shortcut used below
# are only accepted by older matplotlib / spaCy releases -- confirm the
# versions this script is pinned to before upgrading either.
plt.style.use("seaborn")
warnings.filterwarnings(action='ignore')
listing = pd.read_csv('data/listings.csv')

# Rescale the rating by 1/20 and bucket it into one-star-wide bands.  Explicit
# bin edges are used so each label matches the interval it names; an integer
# bin count would instead split the observed min..max range into equal parts.
# Rows without a rating get their own "no_review" category, which has to be
# registered on the categorical before it can be used as a fill value.
listing['review_scores_ratings'] = pd.cut(
    listing['review_scores_rating'] / 20,
    bins=[0, 1, 2, 3, 4, 5],
    labels=["0.0-1.0", "1.0-2.0", "2.0-3.0", "3.0-4.0", "4.0-5.0"],
    include_lowest=True,
).cat.add_categories("no_review").fillna("no_review")

# Assign the result back instead of mutating a column selection in place.
listing['description'] = listing['description'].fillna("no_description")

nlp = spacy.load("en")

# Categorize by the binned label column: the explorer below selects the
# "4.0-5.0" category, which does not exist in the raw numeric rating column.
corpus = st.CorpusFromPandas(listing,
                             category_col="review_scores_ratings",
                             text_col="description",
                             nlp=nlp).build().remove_terms(
                                 ENGLISH_STOP_WORDS, ignore_absences=True)

html = st.produce_scattertext_explorer(corpus,
                                       category="4.0-5.0",
                                       category_name="Star Rating 4.0 - 5.0",
                                       not_category_name="All Other Ratings",
                                       width_in_pixels=1000,
                                       minimum_term_frequency=200)
# Use a context manager so the output file is closed deterministically.
with open("term-associations.html", "wb") as html_file:
    html_file.write(html.encode("utf-8"))
feat_builder = st.FeatsFromOnlyEmpath()