Example #1
    def gitc(self, dataframe):
        general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()

        corpus = st.CorpusFromPandas(
            dataframe,
            category_col='Document Type',
            text_col='Text',
            nlp=st.whitespace_nlp_with_sentences,
            feats_from_spacy_doc=general_inquirer_feature_builder).build()

        html = st.produce_frequency_explorer(
            corpus,
            category='submission',
            category_name='Submission',
            not_category_name='Standard',
            use_non_text_features=True,
            use_full_doc=True,
            term_scorer=st.LogOddsRatioUninformativeDirichletPrior(),
            grey_threshold=1.96,
            width_in_pixels=1000,
            metadata=dataframe['Document'],
            topic_model_term_lists=general_inquirer_feature_builder.get_top_model_term_lists())

        logger.getLogger().info("Opening GITC-Visual")
        open(self.gitc_file, 'wb').write(html.encode('utf-8'))
        webbrowser.open("file://" + self.gitc_file)
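# Example #1 above is a method lifted out of a larger class, so its imports and
# enclosing class are missing. A minimal sketch of the assumed context (the class
# name, the constructor, and the "import logging as logger" alias are guesses
# inferred from the method body, not the original project):
import logging as logger
import webbrowser

import scattertext as st


class GitcVisualizer:  # hypothetical class name
    def __init__(self, gitc_file):
        # Absolute path of the HTML report that gitc() writes and opens. The
        # DataFrame passed to gitc() is expected to carry 'Document Type',
        # 'Text', and 'Document' columns.
        self.gitc_file = gitc_file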
Example #2
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences).build().get_unigram_corpus()

beta_posterior = st.BetaPosterior(corpus).set_categories('fresh', ['rotten'])
score_df = beta_posterior.get_score_df()
print("Top Fresh Terms")
print(score_df.sort_values(by='cat_p').head())

print("Top Rotten Terms")
print(score_df.sort_values(by='ncat_p').head())

html = st.produce_frequency_explorer(corpus,
                                     category='fresh',
                                     not_category_name='rotten',
                                     term_scorer=beta_posterior,
                                     grey_threshold=1.96)

file_name = 'demo_beta_posterior.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % file_name)
Example #3
    scores[corpus.get_terms()[feati]].append((acc - shuff_acc) / acc)
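# The indented line above is the tail of a permutation-importance loop whose
# setup (the corpus and model construction, and the scores/pred_diff
# accumulators) is missing from this snippet. As a rough, self-contained
# illustration of that pattern only (every name below is a stand-in, not the
# original demo's code):
import numpy as np
from collections import defaultdict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
feature_names = ['feat_%d' % i for i in range(X.shape[1])]

rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)
acc = accuracy_score(y, rf.predict(X))
base_proba = rf.predict_proba(X)[:, 1]

scores, pred_diff = defaultdict(list), defaultdict(list)
for feati in range(X.shape[1]):
    X_shuffled = X.copy()
    np.random.shuffle(X_shuffled[:, feati])  # break the feature/label link
    shuff_acc = accuracy_score(y, rf.predict(X_shuffled))
    scores[feature_names[feati]].append((acc - shuff_acc) / acc)
    pred_diff[feature_names[feati]].append(
        np.mean(np.abs(base_proba - rf.predict_proba(X_shuffled)[:, 1])))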
print("Features sorted by their score:")
print(
    sorted([(round(np.mean(score), 4), feat)
            for feat, score in scores.items()],
           reverse=True))

print("Features sorted by their pred diff:")
print(
    sorted([(round(np.mean(score), 4), feat)
            for feat, score in pred_diff.items()],
           reverse=True))

term_scores = pd.Series(index=corpus.get_terms(), dtype=float)
top_terms = pd.Series(scores).apply(np.mean)
term_scores.loc[top_terms.index] = top_terms.values
term_scores = term_scores.fillna(0)

html = st.produce_frequency_explorer(corpus,
                                     category='Positive',
                                     not_categories=['Negative'],
                                     neutral_categories=['Plot'],
                                     scores=term_scores.values,
                                     metadata=movie_df['movie_name'],
                                     grey_threshold=0,
                                     show_neutral=True)

file_name = 'demo_rf.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('./' + file_name)
Example #4
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=general_inquirer_feature_builder).build()
html = st.produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    metadata=convention_df['speaker'],
    use_non_text_features=True,
    use_full_doc=True,
    term_scorer=st.LogOddsRatioUninformativeDirichletPrior(),
    grey_threshold=1.96,
    width_in_pixels=1000,
    topic_model_term_lists=general_inquirer_feature_builder.get_top_model_term_lists(),
    metadata_descriptions=general_inquirer_feature_builder.get_definitions())
fn = 'demo_general_inquirer_frequency_plot.html'
with open(fn, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (fn))
Example #5
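# This example begins mid-way through its setup: the snippet resumes below with
# the dangling tail of a classifier constructor. A plausible preamble is sketched
# here; the 20-newsgroups loading, the tf-idf vectorizer, and the choice of
# LogisticRegression are assumptions inferred from the variables used later, not
# the original code.
import scattertext as st
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'))
vectorizer = TfidfVectorizer()
tfidf_X = vectorizer.fit_transform(newsgroups_train.data)
clf = LogisticRegression(penalty='l1',  # assumed model; the next two lines
                         solver='liblinear',  # complete this constructor call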
                   C=1.0 / tfidf_X.shape[0],
                   tol=1e-3)
clf.fit(tfidf_X, newsgroups_train.target)

corpus = st.CorpusFromScikit(X=CountVectorizer(
    vocabulary=vectorizer.vocabulary_).fit_transform(newsgroups_train.data),
                             y=newsgroups_train.target,
                             feature_vocabulary=vectorizer.vocabulary_,
                             category_names=newsgroups_train.target_names,
                             raw_texts=newsgroups_train.data).build()

html = st.produce_frequency_explorer(
    corpus,
    'alt.atheism',
    scores=clf.coef_[0],
    use_term_significance=False,
    terms_to_include=st.AutoTermSelector.get_selected_terms(
        corpus, clf.coef_[0]),
    metadata=[
        '/'.join(fn.split('/')[-2:]) for fn in newsgroups_train.filenames
    ])

file_name = "demo_sklearn.html"
open(file_name, 'wb').write(html.encode('utf-8'))
print("open " + file_name)

sfs = (corpus.get_scaled_f_scores('alt.atheism') - 0.5) * 2
html = st.produce_frequency_explorer(
    corpus,
    'alt.atheism',
    scores=sfs,
    use_term_significance=False,
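    # The snippet is cut off at this point. Mirroring the first
    # produce_frequency_explorer call above (and the tab-indented duplicate of
    # this demo later on this page), it presumably continues along these lines;
    # the output file name is an assumption:
    terms_to_include=st.AutoTermSelector.get_selected_terms(corpus, sfs),
    metadata=[
        '/'.join(fn.split('/')[-2:]) for fn in newsgroups_train.filenames
    ])

open('demo_sklearn_sfs.html', 'wb').write(html.encode('utf-8'))  # assumed name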
Example #6
from scattertext.termscoring.DeltaJSDivergence import DeltaJSDivergence
from scattertext.termcompaction.AssociationCompactor import JSDCompactor
from scattertext.Scalers import dense_rank  # assumed import path; transform=dense_rank below needs it
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences).build().get_unigram_corpus().compact(
        JSDCompactor(1000))

html = produce_frequency_explorer(corpus,
                                  category='democrat',
                                  category_name='Democratic',
                                  not_category_name='Republican',
                                  minimum_term_frequency=0,
                                  pmi_threshold_coefficient=0,
                                  width_in_pixels=1000,
                                  metadata=convention_df['speaker'],
                                  term_scorer=DeltaJSDivergence(),
                                  transform=dense_rank,
                                  term_metadata_df=corpus.get_term_freq_df(''),
                                  enable_term_category_description=False)

open('./demo_JSDivergence.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_JSDivergence.html in Chrome or Firefox.')
Example #7
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, CohensD, produce_frequency_explorer, \
    OncePerDocFrequencyRanker
from scattertext.termcompaction.ClassPercentageCompactor import ClassPercentageCompactor
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termranking import ClassBalancedFrequencyRanker
from scattertext.termscoring.ScaledFScore import ScaledFScorePresets

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = (CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences).build().compact(
        ClassPercentageCompactor(term_ranker=OncePerDocFrequencyRanker)))

html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    term_scorer=CohensD(corpus).set_term_ranker(
        ClassBalancedFrequencyRanker).set_categories('democrat',
                                                     ['republican']),
    metadata=convention_df['speaker'],
    grey_threshold=0,
    show_neutral=True)
file_name = 'demo_cohens_d.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open ./demo_cohens_d.html in Chrome or Firefox.')
Example #8
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category\
	.apply(lambda x: {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}[x])

corpus = st.CorpusFromPandas(
	movie_df,
	category_col='category',
	text_col='text',
	nlp=st.whitespace_nlp_with_sentences
).build().get_unigram_corpus()

term_scorer = (st.RelativeEntropy(corpus)
               .set_categories('Positive', ['Negative'], ['Plot']))

html = st.produce_frequency_explorer(
	corpus,
	category='Positive',
	not_categories=['Negative'],
	neutral_categories=['Plot'],
	term_scorer=term_scorer,
	metadata=movie_df['movie_name'],
	grey_threshold=0,
	show_neutral=True
)
file_name = 'demo_relative_entropy.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('./' + file_name)
Example #9
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
corpus = (st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences).build().get_unigram_corpus())
html = st.produce_frequency_explorer(corpus,
                                     category='BARACK OBAMA',
                                     term_scorer=st.ScaledFScorePresets(
                                         one_to_neg_one=True,
                                         use_score_difference=True),
                                     metadata=convention_df['speaker'],
                                     grey_threshold=0)
file_name = 'demo_obama.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))
Example #10
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
corpus = (st.CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences).build().get_unigram_corpus())
term_scorer = st.CredTFIDF(corpus, use_l2_norm=False,
                           use_cred=False).set_categories(
                               'democrat', ['republican'])
print(term_scorer.get_score_df().sort_values(by='delta_cred_tf_idf',
                                             ascending=False).head())
html = st.produce_frequency_explorer(corpus,
                                     category='democrat',
                                     category_name='Democratic',
                                     not_category_name='Republican',
                                     term_scorer=term_scorer,
                                     metadata=convention_df['speaker'],
                                     grey_threshold=0,
                                     include_all_contexts=True)
file_name = 'demo_include_all_contexts.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % file_name)
Example #11
    for name, key in [('Democratic Rank', 'DemocraticRank'),
                      ('Republican Rank',
                       'RepublicanRank'), ('Rank Difference Score',
                                           'RankDiff')]) + '+ "</span>" ;})'
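# The lines above are the cut-off tail of a string-building expression. A
# hypothetical reconstruction (not the original source) that yields a JavaScript
# callback of the same shape, rendering the three term_etc_df columns referenced
# below via the .etc accessor:
get_custom_term_html = (
    '(function(d) {return "<span>" + d.term '
    + ''.join('+ ", %s: " + d.etc.%s ' % (name, key)
              for name, key in [('Democratic Rank', 'DemocraticRank'),
                                ('Republican Rank', 'RepublicanRank'),
                                ('Rank Difference Score', 'RankDiff')])
    + '+ "</span>" ;})')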

html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    term_scorer=DeltaJSDivergence(),
    transform=dense_rank,
    term_metadata_df=term_etc_df,
    get_custom_term_html=get_custom_term_html,
    enable_term_category_description=False,
    header_names={
        'upper': 'Top Dem. RankDiff',
        'lower': 'Top GOP RankDiff'
    },
    header_sorting_algos={
        'upper': '(function(a, b) {return b.etc.RankDiff - a.etc.RankDiff})',
        'lower': '(function(a, b) {return a.etc.RankDiff - b.etc.RankDiff})'
    })

open('./demo_JSDivergence.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_JSDivergence.html in Chrome or Firefox.')
Example #12
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer, HedgesR
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = (CorpusFromPandas(convention_df,
                           category_col='party',
                           text_col='text',
                           nlp=whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus())
html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    term_scorer=HedgesR(corpus),
    metadata=convention_df['speaker'],
    grey_threshold=0
)
file_name = 'demo_hedges_r.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))
Example #13
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()
corpus = st.CorpusFromPandas(convention_df,
                             category_col='party',
                             text_col='text',
                             nlp=st.whitespace_nlp_with_sentences,
                             feats_from_spacy_doc=general_inquirer_feature_builder).build()
html = st.produce_frequency_explorer(corpus,
                                     category='democrat',
                                     category_name='Democratic',
                                     not_category_name='Republican',
                                     metadata=convention_df['speaker'],
                                     use_non_text_features=True,
                                     use_full_doc=True,
                                     term_scorer=st.LogOddsRatioUninformativeDirichletPrior(),
                                     grey_threshold=1.96,
                                     width_in_pixels=1000,
                                     topic_model_term_lists=general_inquirer_feature_builder.get_top_model_term_lists(),
                                     metadata_descriptions=general_inquirer_feature_builder.get_definitions())
fn = 'demo_general_inquirer_frequency_plot.html'
with open(fn, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (fn))
Example #14
from scattertext.termcompaction.CompactTerms import CompactTerms

import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

fn = 'demo_log_odds_ratio_prior.html'
df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = (st.CorpusFromPandas(df,
                              category_col='category',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build())
priors = (st.PriorFactory(corpus,
                          category='fresh',
                          not_categories=['rotten'],
                          starting_count=1)
          #.use_general_term_frequencies()
          .use_all_categories()
          .get_priors())
(open(fn, 'wb')
	.write(
	st.produce_frequency_explorer(
		corpus,
		category='fresh',
		not_categories=['rotten'],
		metadata=df['movie_name'],
		term_scorer=LogOddsRatioInformativeDirichletPrior(priors, 1),
	).encode('utf-8'))
)
print(fn)
Example #15
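# This example begins mid-way through building the corpus: the dangling
# arguments below complete a CorpusFromPandas call. The missing setup presumably
# resembles this sketch (the feature-builder class name is inferred from how
# moral_foundations_feats is used later, so treat it as an assumption):
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
moral_foundations_feats = st.FeatsFromMoralFoundationsDictionary()
corpus = st.CorpusFromPandas(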
    convention_df,
    category_col='party',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=moral_foundations_feats).build()

cohens_d_scorer = st.CohensD(corpus).use_metadata()
term_scorer = cohens_d_scorer.set_categories('democrat', ['republican'])
mfd_df = term_scorer.get_score_df()
print(mfd_df.head())
mfd_df.to_csv('demo_moral_foundations.csv')
print('See demo_moral_foundations.csv for the output.')

html = st.produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    metadata=convention_df['speaker'],
    use_non_text_features=True,
    use_full_doc=True,
    term_scorer=st.CohensD(corpus).use_metadata(),
    grey_threshold=0,
    width_in_pixels=1000,
    topic_model_term_lists=moral_foundations_feats.get_top_model_term_lists(),
    metadata_descriptions=moral_foundations_feats.get_definitions())
fn = 'demo_moral_foundations.html'
with open(fn, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (fn))
Example #16
# 	                                      scaler_algo='normcdf'
#                                       ),
#                                   grey_threshold=0,
#                                   y_axis_values=[-1, 0, 1],
#                                   metadata=convention_df['speaker'])
# fn = './demo_scaled_f_score.html'
# open(fn, 'wb').write(html.encode('utf-8'))
# print('Open ' + fn + ' in Chrome or Firefox.')

# ================================================================================
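# This example's header is cut off above; based on the names used below, it
# presumably contains imports along these lines (utils_data is a project-local
# module, so this block is an assumption, not the original code):
import pandas as pd

from scattertext import whitespace_nlp_with_sentences, produce_frequency_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termscoring.ScaledFScore import ScaledFScorePresetsNeg1To1

import utils_data  # project-local helper supplying the review data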
all_satisfaction_score_comment_in_all_conds = utils_data.get_all_satisfaction_score_comment_in_all_conds()

columns = ['senti_on_Metfor_oral', 'feature', 'review']
all_satisfaction_score_comment_in_all_conds_df = pd.DataFrame(
    all_satisfaction_score_comment_in_all_conds, index=None, columns=columns)
# print("all_satisfaction_score_comment_in_all_conds_df", all_satisfaction_score_comment_in_all_conds_df)

# ================================================================================
corpus = CorpusFromPandas(
    all_satisfaction_score_comment_in_all_conds_df,
    category_col='senti_on_Metfor_oral',
    text_col='review',
    nlp=whitespace_nlp_with_sentences).build().get_unigram_corpus()

# ================================================================================
html = produce_frequency_explorer(
    corpus,
    category='negative',
    category_name='Negative',
    not_category_name='Positive',
    minimum_term_frequency=5,
    width_in_pixels=1000,
    term_scorer=ScaledFScorePresetsNeg1To1(beta=1, scaler_algo='normcdf'),
    grey_threshold=0,
    y_axis_values=[-1, 0, 1],
    metadata=all_satisfaction_score_comment_in_all_conds_df['feature'])

# ================================================================================
fn = '/mnt/1T-5e7/mycodehtml/Data_mining/Visualization/Scattertext/demo_scaled_f_score.html'
open(fn, 'wb').write(html.encode('utf-8'))
print('Open ' + fn + ' in Chrome or Firefox.')
Example #17
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termscoring.ScaledFScore import ScaledFScorePresetsNeg1To1

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=whitespace_nlp_with_sentences).build()
html = produce_frequency_explorer(corpus,
                                  category='democrat',
                                  category_name='Democratic',
                                  not_category_name='Republican',
                                  minimum_term_frequency=5,
                                  width_in_pixels=1000,
                                  term_scorer=ScaledFScorePresetsNeg1To1(
                                      beta=1, scaler_algo='normcdf'),
                                  grey_threshold=0,
                                  y_axis_values=[-1, 0, 1],
                                  metadata=convention_df['speaker'])
fn = './demo_scaled_f_score.html'
open(fn, 'wb').write(html.encode('utf-8'))
print('Open ' + fn + ' in Chrome or Firefox.')
Example #18
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

corpus = st.CorpusFromPandas(movie_df,
                             category_col='category',
                             text_col='text',
                             nlp=st.whitespace_nlp_with_sentences).build()
corpus = corpus.get_unigram_corpus()

score_df = st.MannWhitneyU(corpus).set_categories(
    'plot', ['fresh', 'rotten']).get_score_df('fdr_bh')

print(score_df.sort_values(by='mwu_z', ascending=False).head())
print(score_df.sort_values(by='mwu_z', ascending=False).tail())

html = st.produce_frequency_explorer(corpus,
                                     category='plot',
                                     y_label='Mann Whitney FDR-BH Z',
                                     scores=score_df.mwu_z,
                                     grey_threshold=0)

file_name = 'demo_mann_whitney.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % file_name)
Example #19
                   C=1.0 / tfidf_X.shape[0],
                   tol=1e-3)
clf.fit(tfidf_X, newsgroups_train.target)

corpus = st.CorpusFromScikit(
	X=CountVectorizer(vocabulary=vectorizer.vocabulary_).fit_transform(newsgroups_train.data),
	y=newsgroups_train.target,
	feature_vocabulary=vectorizer.vocabulary_,
	category_names=newsgroups_train.target_names,
	raw_texts=newsgroups_train.data
).build()

html = st.produce_frequency_explorer(
	corpus,
	'alt.atheism',
	scores=clf.coef_[0],
	use_term_significance=False,
	terms_to_include=st.AutoTermSelector.get_selected_terms(corpus, clf.coef_[0]),
	metadata = ['/'.join(fn.split('/')[-2:]) for fn in newsgroups_train.filenames]
)

file_name = "demo_sklearn.html"
open(file_name, 'wb').write(html.encode('utf-8'))
print("open " + file_name)

sfs = (corpus.get_scaled_f_scores('alt.atheism') - 0.5) * 2
html = st.produce_frequency_explorer(
	corpus,
	'alt.atheism',
	scores=sfs,
	use_term_significance=False,
	terms_to_include=st.AutoTermSelector.get_selected_terms(corpus, sfs),
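    # Cut off here as well; mirroring the first call in this snippet, the call
    # presumably closes with the same metadata argument, after which the HTML
    # would be written out (the file name is an assumption):
    metadata=['/'.join(fn.split('/')[-2:]) for fn in newsgroups_train.filenames]
)

open('demo_sklearn_sfs.html', 'wb').write(html.encode('utf-8'))  # assumed name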
Example #20
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer, HedgesR
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = (CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences).build().get_unigram_corpus())
html = produce_frequency_explorer(corpus,
                                  category='democrat',
                                  category_name='Democratic',
                                  not_category_name='Republican',
                                  term_scorer=HedgesR(corpus),
                                  metadata=convention_df['speaker'],
                                  grey_threshold=0)
file_name = 'demo_hedges_r.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))
Example #21
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

corpus = st.CorpusFromPandas(movie_df,
                             category_col='category',
                             text_col='text',
                             nlp=st.whitespace_nlp_with_sentences).build(
                             ).get_unigram_corpus().remove_categories(['plot'])

term_scorer = st.CredTFIDF(corpus).set_categories('fresh', ['rotten'])

print(term_scorer.get_score_df().sort_values(by='delta_cred_tf_idf',
                                             ascending=False).head())

html = st.produce_frequency_explorer(corpus,
                                     category='fresh',
                                     not_category_name='rotten',
                                     term_scorer=term_scorer,
                                     metadata=corpus.get_df()['movie_name'],
                                     grey_threshold=0)
file_name = 'demo_cred_tfidf.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % file_name)