Ejemplo n.º 1
0
from scattertext.termcompaction.AssociationCompactor import JSDCompactor

from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer, RankDifference
from scattertext.CorpusFromPandas import CorpusFromPandas

# Fetch the ConventionData2012 sample data frame from scattertext's SampleCorpora.
convention_df = SampleCorpora.ConventionData2012.get_data()

# Build a corpus from the data frame ('party' is the category column, 'text'
# the document column), keep its unigram corpus, then compact it with
# JSDCompactor(1000).
parsed_corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences,
).build()
unigram_corpus = parsed_corpus.get_unigram_corpus()
corpus = unigram_corpus.compact(JSDCompactor(1000))

# dense_rank is called by the lambdas below, but none of this example's
# imports provide it; import it here so .assign() does not raise NameError.
from scattertext.Scalers import dense_rank

# Term-frequency table extended with one rank column per party and a
# RankDifference score column.
# NOTE(review): the '' argument presumably removes the column-name suffix so
# the columns are plain 'democrat' / 'republican' -- confirm against
# get_term_freq_df.
term_etc_df = corpus.get_term_freq_df('').assign(
    DemocraticRank=lambda df: dense_rank(df['democrat']),
    RepublicanRank=lambda df: dense_rank(df['republican']),
    RankDiff=lambda df: RankDifference().get_scores(df['democrat'],
                                                    df['republican']),
)

# Source text of a JavaScript function of x: it returns "Term: <x.term>"
# followed by a topic_preview span holding one "<br>label: value" entry per
# field read from x.etc, each formatted with toFixed(5).
etc_fields = (
    ('Democratic Rank', 'DemocraticRank'),
    ('Republican Rank', 'RepublicanRank'),
    ('Rank Difference Score', 'RankDiff'),
)
etc_entries = [
    f''' + "<br>{label}: " + x.etc.{attr}.toFixed(5)'''
    for label, attr in etc_fields
]
get_custom_term_html = (
    '(function(x) {return "Term: " + x.term + "<span class=topic_preview>"'
    + ' '.join(etc_entries)
    + '+ "</span>" ;})'
)

html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
Ejemplo n.º 2
0
import numpy as np
import spacy
from sklearn.linear_model import LogisticRegression

from scattertext import SampleCorpora
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

# The 'en' shortcut link was removed in spaCy v3.0 and makes spacy.load fail
# there; the full package name loads the same English pipeline and is also
# accepted by older spaCy versions that have the package installed.
nlp = spacy.load('en_core_web_sm')

# Fetch the ConventionData2012 sample data and build a corpus from it, using
# 'party' as the category column and 'text' as the document column.
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

# Term-frequency table of the corpus, used by the scaling code that follows.
term_freq_df = corpus.get_term_freq_df()

def scale(ar):
    """Linearly rescale ``ar`` so its minimum maps to 0 and its maximum to 1.

    ``ar`` must provide ``min()``, ``max()`` and elementwise arithmetic
    (e.g. a NumPy array).  There is no guard for the case where all values
    are equal, so the division is then by zero.
    """
    lowest = ar.min()
    value_range = ar.max() - lowest
    return (ar - lowest) / value_range

def zero_centered_scale(ar):
    """Rescale signed values into [0, 1] so that zero lands on 0.5.

    Positive entries are min-max scaled into [0, 1] and negative entries
    into [-1, 0], each side independently via ``scale``.  The result is then
    shifted by 1 and halved.

    The computation runs on a copy, so the caller's array keeps its original
    values.  A side with no entries is skipped rather than passed to
    ``scale``, which cannot reduce an empty NumPy array.  A side whose values
    are all equal still hits the zero-range division inside ``scale``.

    :param ar: array supporting ``copy()``, comparisons and boolean-mask
        assignment (e.g. a NumPy array).
    :return: a new array holding the rescaled values.
    """
    # The masked assignments below would otherwise overwrite the caller's data.
    ar = ar.copy()

    positive = ar > 0
    if positive.any():
        ar[positive] = scale(ar[positive])

    # Computed after the positive pass; that pass only writes values that are
    # not below zero, so the negative mask is unaffected by it.
    negative = ar < 0
    if negative.any():
        ar[negative] = -scale(-ar[negative])

    return (ar + 1) / 2.


# Log of each row total of term_freq_df, min-max scaled by scale().
row_totals = term_freq_df.sum(axis=1).values
frequencies_scaled = scale(np.log(row_totals))

# Scores returned by corpus.get_logreg_coefs for the 'democrat' category with
# an L2-penalised LogisticRegression, then rescaled by zero_centered_scale().
classifier = LogisticRegression(penalty='l2', C=10, max_iter=10000, n_jobs=-1)
scores = corpus.get_logreg_coefs('democrat', classifier)
scores_scaled = zero_centered_scale(scores)
Ejemplo n.º 3
0
from scattertext.termscoring.DeltaJSDivergence import DeltaJSDivergence

from scattertext.termcompaction.AssociationCompactor import JSDCompactor

from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

# Fetch the ConventionData2012 sample data frame from scattertext's SampleCorpora.
convention_df = SampleCorpora.ConventionData2012.get_data()

# Corpus built from the 'text' column with 'party' as the category, reduced
# to its unigram corpus and then compacted with JSDCompactor(1000).
corpus_builder = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences,
)
unigram_corpus = corpus_builder.build().get_unigram_corpus()
term_compactor = JSDCompactor(1000)
corpus = unigram_corpus.compact(term_compactor)

# dense_rank is passed as `transform` below, but none of this example's
# imports provide it; import it here so the call does not raise NameError.
from scattertext.Scalers import dense_rank

# Produce the frequency-explorer HTML for 'democrat' versus the rest, scoring
# terms with DeltaJSDivergence and attaching the speaker column as metadata.
html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    term_scorer=DeltaJSDivergence(),
    transform=dense_rank,
    term_metadata_df=corpus.get_term_freq_df(''),
    enable_term_category_description=False,
)

# Write the page as UTF-8 bytes. The with-block closes (and so flushes) the
# file before the user is told to open it; the original never closed it.
with open('./demo_JSDivergence.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./demo_JSDivergence.html in Chrome or Firefox.')