# Topic-model explorer example. The statements of this script are collapsed
# onto a single physical line, and the text is cut off inside the final call,
# so that call's remaining arguments are not visible here.
#
# Visible steps, in order:
#   1. Load the ConventionData2012 sample data via st.SampleCorpora and add a
#      'parse' column by applying st.whitespace_nlp_with_sentences to 'text'.
#   2. Build a corpus with st.CorpusFromParsedDocuments (category_col='party',
#      parsed_col='parse') and call get_stoplisted_unigram_corpus() on it.
#   3. Call st.SentencesForTopicModeling(...).get_topics_from_terms with
#      thirteen seed terms, scorer=RankDifference(), num_terms_per_topic=20.
#   4. Wrap the result in st.FeatsFromTopicModel and build a second corpus,
#      passing that object as feats_from_spacy_doc.
#   5. Begin a call to
#      scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer
#      with category='democrat', category_name='Democratic' and
#      not_category_name='Republican'; the result is assigned to `html`.
#
# NOTE(review): `st` is used throughout, but neither visible import binds it
# (presumably `import scattertext as st` was intended) -- confirm.
# NOTE(review): as written this line is not valid Python, because the
# statements have no separators; restore the line breaks before running.
import scattertext.interface.ProduceScattertextExplorer from scattertext import RankDifference convention_df = st.SampleCorpora.ConventionData2012.get_data() convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences) unigram_corpus = (st.CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parse') .build().get_stoplisted_unigram_corpus()) topic_model = (st.SentencesForTopicModeling(unigram_corpus) .get_topics_from_terms(['obama', 'romney', 'democrats', 'republicans', 'health', 'military', 'taxes', 'education', 'olympics', 'auto', 'iraq', 'iran', 'israel'], scorer=RankDifference(), num_terms_per_topic=20)) topic_feature_builder = st.FeatsFromTopicModel(topic_model) topic_corpus = st.CorpusFromParsedDocuments( convention_df, category_col='party', parsed_col='parse', feats_from_spacy_doc=topic_feature_builder ).build() html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer( topic_corpus, category='democrat', category_name='Democratic', not_category_name='Republican',
# Topic-model explorer example that calls the explorer through the `st`
# alias. The statements of this script are collapsed onto a single physical
# line, and the text is cut off inside the final call, so that call's
# remaining arguments are not visible here.
#
# Visible steps, in order:
#   1. Load the ConventionData2012 sample data via st.SampleCorpora and add a
#      'parse' column by applying st.whitespace_nlp_with_sentences to 'text'.
#   2. Build a corpus with st.CorpusFromParsedDocuments (category_col='party',
#      parsed_col='parse') and call get_stoplisted_unigram_corpus() on it.
#   3. Call st.SentencesForTopicModeling(...).get_topics_from_terms with
#      thirteen seed terms, scorer=RankDifference(), num_terms_per_topic=20.
#   4. Wrap the result in st.FeatsFromTopicModel and build a second corpus,
#      passing that object as feats_from_spacy_doc.
#   5. Begin a call to st.produce_scattertext_explorer with
#      category='democrat', category_name='Democratic' and
#      not_category_name='Republican'; the result is assigned to `html`.
#
# NOTE(review): no import is visible for `st` or `RankDifference` in this
# snippet; presumably they are imported in text that is not shown -- confirm.
# NOTE(review): as written this line is not valid Python, because the
# statements have no separators; restore the line breaks before running.
convention_df = st.SampleCorpora.ConventionData2012.get_data() convention_df['parse'] = convention_df['text'].apply( st.whitespace_nlp_with_sentences) unigram_corpus = (st.CorpusFromParsedDocuments( convention_df, category_col='party', parsed_col='parse').build().get_stoplisted_unigram_corpus()) topic_model = ( st.SentencesForTopicModeling(unigram_corpus).get_topics_from_terms( [ 'obama', 'romney', 'democrats', 'republicans', 'health', 'military', 'taxes', 'education', 'olympics', 'auto', 'iraq', 'iran', 'israel' ], scorer=RankDifference(), num_terms_per_topic=20)) topic_feature_builder = st.FeatsFromTopicModel(topic_model) topic_corpus = st.CorpusFromParsedDocuments( convention_df, category_col='party', parsed_col='parse', feats_from_spacy_doc=topic_feature_builder).build() html = st.produce_scattertext_explorer( topic_corpus, category='democrat', category_name='Democratic', not_category_name='Republican',
# Frequency-explorer example with per-term rank metadata. The statements of
# this script are collapsed onto a single physical line, and the text is cut
# off inside the final call, so that call's remaining arguments are not
# visible here.
#
# Visible steps, in order:
#   1. Load the ConventionData2012 sample data via SampleCorpora.
#   2. Build a corpus with CorpusFromPandas (category_col='party',
#      text_col='text', nlp=whitespace_nlp_with_sentences), then call
#      get_unigram_corpus() and compact(JSDCompactor(1000)) on it.
#   3. Take corpus.get_term_freq_df('') and assign three extra columns:
#      DemocraticRank and RepublicanRank (dense_rank of the 'democrat' and
#      'republican' columns) and RankDiff (RankDifference().get_scores of
#      those same two columns).
#   4. Build `get_custom_term_html`, a Python string holding the source of a
#      JavaScript function of `x`. The function returns "Term: " + x.term
#      followed by a <span class=topic_preview> containing one
#      "<br>{label}: " + x.etc.{key}.toFixed(5) entry for each of the three
#      (label, key) pairs listed in the generator expression.
#   5. Begin a call to produce_frequency_explorer with category='democrat',
#      category_name='Democratic', not_category_name='Republican' and
#      minimum_term_frequency=0; the result is assigned to `html`.
#
# NOTE(review): `JSDCompactor` and `dense_rank` are used but are not bound by
# either visible import -- confirm where they are meant to come from.
# NOTE(review): `term_etc_df` is not referenced in the visible text;
# presumably it is passed to the truncated call -- confirm.
# NOTE(review): as written this line is not valid Python, because the
# statements have no separators; restore the line breaks before running.
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer, RankDifference from scattertext.CorpusFromPandas import CorpusFromPandas convention_df = SampleCorpora.ConventionData2012.get_data() corpus = CorpusFromPandas( convention_df, category_col='party', text_col='text', nlp=whitespace_nlp_with_sentences).build().get_unigram_corpus().compact( JSDCompactor(1000)) term_etc_df = corpus.get_term_freq_df('').assign( DemocraticRank=lambda df: dense_rank(df['democrat']), RepublicanRank=lambda df: dense_rank(df['republican']), RankDiff=lambda df: RankDifference().get_scores(df['democrat'], df[ 'republican']), ) get_custom_term_html = '(function(x) {return "Term: " + x.term + "<span class=topic_preview>"' + ' '.join( f''' + "<br>{name}: " + x.etc.{key}.toFixed(5)''' for name, key in [('Democratic Rank', 'DemocraticRank'), ('Republican Rank', 'RepublicanRank'), ('Rank Difference Score', 'RankDiff')]) + '+ "</span>" ;})' html = produce_frequency_explorer( corpus, category='democrat', category_name='Democratic', not_category_name='Republican', minimum_term_frequency=0,