Exemple #1
0
from sklearn.decomposition import KernelPCA, NMF
from sklearn.preprocessing import RobustScaler
from statsmodels.multivariate.pca import PCA

import scattertext as st

# Load the 2012 convention sample data and create the General Inquirer
# feature builder that is handed to the corpus as `feats_from_spacy_doc`.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()

# Categories come from the 'speaker' column; text is tokenized with
# scattertext's whitespace tokenizer.
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=general_inquirer_feature_builder,
).build().get_unigram_corpus()

# Render the pair plot over the metadata features (use_metadata=True).
html = st.produce_pairplot(
    corpus,
    use_metadata=True,
    category_projector=st.CategoryProjector(compactor=None),
    topic_model_term_lists=(
        general_inquirer_feature_builder.get_top_model_term_lists()),
    topic_model_preview_size=100,
    metadata_descriptions=general_inquirer_feature_builder.get_definitions(),
    # Each document is labelled "<party>: <speaker>".
    metadata=convention_df['party'] + ': ' + convention_df['speaker'])

file_name = 'convention_pair_plot_geninq.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to garbage collection.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
Exemple #2
0
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA

import scattertext as st

# Fetch the 20 newsgroups training split with headers, footers and quoted
# replies removed.
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'))

# The TF-IDF vectorizer is fitted only to obtain a vocabulary; the corpus
# itself is built from raw counts over that same vocabulary.
vectorizer = TfidfVectorizer()
tfidf_X = vectorizer.fit_transform(newsgroups_train.data)

count_vectorizer = CountVectorizer(vocabulary=vectorizer.vocabulary_)
corpus = st.CorpusFromScikit(
    X=count_vectorizer.fit_transform(newsgroups_train.data),
    y=newsgroups_train.target,
    feature_vocabulary=vectorizer.vocabulary_,
    category_names=newsgroups_train.target_names,
    raw_texts=newsgroups_train.data).build().get_unigram_corpus()

# Project categories with a 10-component PCA and focus the plot on
# the 'alt.atheism' category.
html = st.produce_category_focused_pairplot(
    corpus=corpus,
    category_projector=st.CategoryProjector(projector=PCA(10)),
    category='alt.atheism')

file_name = 'demo_pair_plot_category_focused.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to garbage collection.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))
import umap
from sklearn.feature_extraction.text import TfidfTransformer

import scattertext as st

# Display labels for the raw category values in the Rotten Tomatoes sample.
# Built once here rather than on every row inside the lambda.
CATEGORY_LABELS = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
# Indexing (not .get) keeps the original KeyError on an unexpected value.
movie_df.category = movie_df.category \
    .apply(lambda x: CATEGORY_LABELS[x])

# Categories come from the 'movie_name' column.
corpus = st.CorpusFromPandas(movie_df,
                             category_col='movie_name',
                             text_col='text',
                             nlp=st.whitespace_nlp_with_sentences).build(
                             ).get_stoplisted_unigram_corpus()

# Project the categories with UMAP using the cosine metric.
category_projection = st.CategoryProjector(projector=umap.UMAP(
    metric='cosine')).project(corpus)

html = st.produce_pairplot(
    corpus,
    # Alternative left disabled by the original author:
    # category_projection=st.get_optimal_category_projection(corpus, verbose=True),
    category_projection=category_projection,
    # Each document is labelled "<category>: <movie_name>".
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    scaler=st.Scalers.scale_0_to_1,
    show_halo=True,
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url=
        'scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js'),
    default_to_term_comparison=False)

file_name = 'movie_pair_plot_umap.html'
# The original stopped after naming the output file, so the rendered HTML
# was discarded. Persist it and report the path, matching the other
# pair-plot examples.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# `umap` is used below for the projector; import it with this example's own
# imports so the script does not depend on an import made by earlier code.
import umap
from sklearn.feature_extraction.text import TfidfTransformer

import scattertext as st

# Display labels for the raw category values in the Rotten Tomatoes sample.
# Built once here rather than on every row inside the lambda.
CATEGORY_LABELS = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
# Indexing (not .get) keeps the original KeyError on an unexpected value.
movie_df.category = movie_df.category \
    .apply(lambda x: CATEGORY_LABELS[x])

# Categories come from the 'movie_name' column.
corpus = st.CorpusFromPandas(movie_df,
                             category_col='movie_name',
                             text_col='text',
                             nlp=st.whitespace_nlp_with_sentences).build(
                             ).get_stoplisted_unigram_corpus()

# No selector, TF-IDF normalization, then a UMAP projection (cosine metric,
# min_dist=0.5).
category_projection = st.CategoryProjector(
    selector=None,
    normalizer=TfidfTransformer(),
    projector=umap.UMAP(min_dist=0.5, metric='cosine')).project(corpus)

html = st.produce_pairplot(
    corpus,
    # Alternative left disabled by the original author:
    # category_projection=st.get_optimal_category_projection(corpus, verbose=True),
    category_projection=category_projection,
    # Each document is labelled "<category>: <movie_name>".
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    scaler=st.Scalers.scale_0_to_1,
    show_halo=False,
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url=
        'scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js'))

file_name = 'movie_pair_plot_umap.html'
# The original stopped after naming the output file, so the rendered HTML
# was discarded. Persist it and report the path, matching the other
# pair-plot examples.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
Exemple #5
0
import scattertext as st
import scattertext.categoryprojector.pairplot

# Load the 2012 convention sample data and create the Empath-only feature
# builder that is handed to the corpus as `feats_from_spacy_doc`.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
empath_feature_builder = st.FeatsFromOnlyEmpath()

# Categories come from the 'speaker' column.
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=empath_feature_builder).build().get_unigram_corpus()

# Render the pair plot over the metadata features (use_metadata=True).
html = scattertext.categoryprojector.pairplot.produce_pairplot(
    corpus,
    use_metadata=True,
    category_projector=st.CategoryProjector(selector=None),
    topic_model_term_lists=empath_feature_builder.get_top_model_term_lists(),
    # Each document is labelled "<party>: <speaker>".
    metadata=convention_df['party'] + ': ' + convention_df['speaker']
)

file_name = 'convention_pair_plot_empath.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to garbage collection.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
Exemple #6
0
# `phate.PHATE()` is used as the projector below, but the module was never
# imported, so the script failed with NameError before producing a plot.
import phate

import scattertext as st

# Display labels for the raw category values in the Rotten Tomatoes sample.
# Built once here rather than on every row inside the lambda.
CATEGORY_LABELS = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
# Indexing (not .get) keeps the original KeyError on an unexpected value.
movie_df.category = movie_df.category \
    .apply(lambda x: CATEGORY_LABELS[x])

# Categories come from the 'movie_name' column.
corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_stoplisted_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    category_projector=st.CategoryProjector(projector=phate.PHATE()),
    # Each document is labelled "<category>: <movie_name>".
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    # Optional settings left disabled by the original author:
    #scaler=st.Scalers.scale_0_to_1,
    #show_halo=False,
    #d3_url_struct=st.D3URLs(
    #    d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    #    d3_url='scattertext/data/viz/scripts/d3.min.js'
    #),
    default_to_term_comparison=False
)

file_name = 'movie_pair_plot_phates.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to garbage collection.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)