# Demo: plot the *documents* (not terms) of the 2012 convention corpus in a
# UMAP projection of their tf-idf vectors, using scattertext's PCA explorer.
# NOTE(review): assumes `convention_df`, `st` (scattertext), `umap`, `pd`, and
# sklearn's TfidfTransformer are already in scope — confirm against the full file.
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus())
# Register speaker names as document-level metadata so each document becomes
# a plottable point labeled by its speaker.
corpus = corpus.add_doc_names_as_metadata(corpus.get_df()['speaker'])

# Tf-idf-weight the term-document matrix, then embed the documents in 2-D.
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
projection_raw = umap.UMAP(min_dist=0.5, metric='cosine').fit_transform(embeddings).T
projection = pd.DataFrame({'term': corpus.get_metadata(),
                           'x': projection_raw[0],
                           'y': projection_raw[1]}).set_index('term')

# Score each document 1 if it belongs to the target category, else 0,
# which drives the point coloring in the explorer.
category = 'democrat'
scores = (corpus.get_category_ids() == corpus.get_categories().index(category)).astype(int)
html = st.produce_pca_explorer(corpus,
                               category=category,
                               category_name='Democratic',
                               not_category_name='Republican',
                               metadata=convention_df['speaker'],
                               width_in_pixels=1000,
                               use_non_text_features=True,
                               use_full_doc=True,
                               projection=projection,
                               scores=scores,
                               show_top_terms=False)
file_name = 'demo_pca_documents.html'
# Fix: use a context manager so the file handle is flushed and closed.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))
# ===== Example 2 (scrape separator; original text: "Ejemplo n.º 2 / 0") =====
# Demo: unified-context scattertext plot of the Rotten Tomatoes corpus, with
# terms positioned by a word-embedding projection and colored per category.
df = st.SampleCorpora.RottenTomatoes.get_data()
df['parse'] = df['text'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(df, category_col='category', parsed_col='parse')
          .build()
          .get_unigram_corpus()
          .compact(st.AssociationCompactor(1000)))  # keep the 1000 most associated terms

# Project term embeddings to 2-D axes and assign each term a category color.
corpus, axes = st.EmbeddingsResolver(corpus).set_embeddings_model().project_embeddings()
term_colors = st.CategoryColorAssigner(corpus).get_term_colors()
html = st.produce_pca_explorer(corpus,
                               category='fresh',
                               not_categories=['rotten'],
                               neutral_categories=['plot'],
                               metadata=df['movie_name'],
                               width_in_pixels=1000,
                               show_axes=False,
                               use_full_doc=True,
                               projection=axes,
                               term_colors=term_colors,
                               show_characteristic=False,
                               show_top_terms=False,
                               unified_context=True,
                               show_category_headings=True,
                               show_cross_axes=False,
                               include_term_category_counts=True,
                               color_func="(function(d) {return modelInfo.term_colors[d.term]})",
                               )
file_name = 'demo_unified_context.html'
# Fix: context manager guarantees the output handle is closed after writing.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))
# ===== Example 3 (scrape separator; original text: "Ejemplo n.º 3 / 0") =====
          # NOTE(review): truncated fragment — the opening of this expression
          # (presumably `corpus = (st.CorpusFromParsedDocuments(...).build().`)
          # was lost when the snippet was scraped; restore it before running.
          get_stoplisted_unigram_corpus().remove_infrequent_words(
              minimum_term_count=3, term_ranker=st.OncePerDocFrequencyRanker))
# Demo: PCA explorer with visible axes, cross-hairs, and labeled axis values,
# using the two leading left-singular vectors of the tf-idf term matrix.
# NOTE(review): relies on `corpus` built earlier in the file and on
# `convention_df` for the speaker metadata.
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat()).T
# Truncated SVD of the (terms x documents) tf-idf matrix; keep k=3 components.
U, S, VT = svds(embeddings, k=3, maxiter=20000, which='LM')

x_dim = 0
y_dim = 1
projection = pd.DataFrame({
    'term': corpus.get_terms(),
    'x': U.T[x_dim],
    'y': U.T[y_dim]
}).set_index('term')

html = st.produce_pca_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    projection=projection,
    metadata=convention_df['speaker'],
    width_in_pixels=1000,
    x_dim=x_dim,
    y_dim=y_dim,
    show_axes_and_cross_hairs=True,
    # Label the axes at their extremes and at zero.
    y_axis_values=[projection['y'].min(), 0, projection['y'].max()],
    x_axis_values=[projection['x'].min(), 0, projection['x'].max()],
    x_axis_values_format='.1f')
file_name = 'demo_axis_crossbars_and_labels.html'
# Fix: close the output file deterministically via a context manager.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open', file_name, 'in chrome')
# Demo: unified-context plot of the Rotten Tomatoes corpus (variant of the
# earlier example; this one trims the corpus with `.select(...)`).
# NOTE(review): a sibling snippet uses `.compact(st.AssociationCompactor(1000))`
# for this step — confirm `.select` is the intended API before unifying.
df = st.SampleCorpora.RottenTomatoes.get_data()
df['parse'] = df['text'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(df, category_col='category', parsed_col='parse')
          .build()
          .get_unigram_corpus()
          .select(st.AssociationCompactor(1000)))

# Project term embeddings to 2-D axes and assign each term a category color.
corpus, axes = st.EmbeddingsResolver(corpus).set_embeddings_model().project_embeddings()
term_colors = st.CategoryColorAssigner(corpus).get_term_colors()
html = st.produce_pca_explorer(corpus,
                               category='fresh',
                               not_categories=['rotten'],
                               neutral_categories=['plot'],
                               metadata=df['movie_name'],
                               width_in_pixels=1000,
                               show_axes=False,
                               use_full_doc=True,
                               projection=axes,
                               term_colors=term_colors,
                               show_characteristic=False,
                               show_top_terms=False,
                               unified_context=True,
                               show_category_headings=True,
                               show_cross_axes=False,
                               include_term_category_counts=True,
                               color_func="(function(d) {return modelInfo.term_colors[d.term]})",
                               )
file_name = 'demo_unified_context.html'
# Fix: context manager guarantees the output handle is closed after writing.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))
    # NOTE(review): truncated fragment — the opening
    # `corpus = (st.CorpusFromParsedDocuments(` call was lost when the snippet
    # was scraped; restore it before running.
    convention_df, category_col='party',
    parsed_col='parse').build().get_stoplisted_unigram_corpus())
# Demo: plot documents as points using a truncated SVD of the tf-idf matrix
# (SVD variant of the UMAP-based document plot earlier in the file).
# NOTE(review): relies on `corpus` and `convention_df` built earlier.
corpus = corpus.add_doc_names_as_metadata(corpus.get_df()['speaker'])

embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
# k=3 leading singular triplets; rows of u give document coordinates.
u, s, vt = svds(embeddings, k=3, maxiter=20000, which='LM')
projection = pd.DataFrame({
    'term': corpus.get_metadata(),
    'x': u.T[0],
    'y': u.T[1]
}).set_index('term')

# Score each document 1 if it belongs to the target category, else 0.
category = 'democrat'
scores = (corpus.get_category_ids() == corpus.get_categories().index(category)
          ).astype(int)
html = st.produce_pca_explorer(corpus,
                               category=category,
                               category_name='Democratic',
                               not_category_name='Republican',
                               metadata=convention_df['speaker'],
                               width_in_pixels=1000,
                               show_axes=False,
                               use_non_text_features=True,
                               use_full_doc=True,
                               projection=projection,
                               scores=scores,
                               show_top_terms=False)
file_name = 'demo_pca_documents.html'
# Fix: close the file handle after writing via a context manager.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))
# Demo: SVD-of-tf-idf term plot for the 2012 convention corpus, written to an
# HTML file named after the chosen projection dimensions.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(
    st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse').build().
          get_stoplisted_unigram_corpus().remove_infrequent_words(
              minimum_term_count=3, term_ranker=st.OncePerDocFrequencyRanker))
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat()).T
# Truncated SVD; keep k=3 leading singular vectors of the term matrix.
U, S, VT = svds(embeddings, k=3, maxiter=20000, which='LM')

x_dim = 0
y_dim = 1
projection = pd.DataFrame({
    'term': corpus.get_terms(),
    'x': U.T[x_dim],
    'y': U.T[y_dim]
}).set_index('term')
html = st.produce_pca_explorer(corpus,
                               category='democrat',
                               category_name='Democratic',
                               not_category_name='Republican',
                               projection=projection,
                               metadata=convention_df['speaker'],
                               width_in_pixels=1000,
                               x_dim=x_dim,
                               y_dim=y_dim)
file_name = 'demo_embeddings_svd_%s_%s.html' % (x_dim, y_dim)
# Fix: close the output file deterministically via a context manager.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open', file_name, 'in chrome')
# ===== Example 7 (scrape separator; original text: "Ejemplo n.º 7 / 0") =====
# Eigenvalue matrix creation: truncated SVD (k=167) of the tf-idf-weighted
# term-document matrix; rows of u give 2-D document coordinates.
# NOTE(review): relies on `corpus` and `Data_join` defined elsewhere — confirm.
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
u, s, vt = svds(embeddings, k=167, maxiter=20000, which='LM')
projection = pd.DataFrame({'term': corpus.get_metadata(), 'x': u.T[0], 'y': u.T[1]}).set_index('term')


# Plotting of graph: documents scored 1 for the positive category, else 0.
category = 'positive'
scores = (corpus.get_category_ids() == corpus.get_categories().index(category)).astype(int)
html = st.produce_pca_explorer(corpus,
                               category=category,
                               category_name='positive',
                               not_category_name='negative',
                               metadata=Data_join['author'],
                               width_in_pixels=1000,
                               show_axes=False,
                               use_non_text_features=True,
                               use_full_doc=True,
                               projection=projection,
                               scores=scores,
                               show_top_terms=False)
# Fix: close the file handle via a context manager.
# NOTE(review): hard-coded user-specific absolute path — consider making this
# a relative or configurable output location.
with open("C:/Users/Ruchira Talekar/Desktop/Convention-Visualization1.html", 'wb') as out_file:
    out_file.write(html.encode('utf-8'))


#################Comparison Graph####################################################################################

#Datajoin3 sentiment analysis
import nltk.sentiment.vader
from nltk.corpus import stopwords
import nltk.tokenize as nt  
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse.linalg import svds

# Comparison graph: SVD-based term PCA explorer for the 2012 convention corpus
# (Democratic vs. Republican speeches), written to an HTML file.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus()
          .remove_infrequent_words(minimum_term_count=3, term_ranker=st.OncePerDocFrequencyRanker))
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat()).T
# Truncated SVD; keep k=3 leading singular vectors of the term matrix.
U, S, VT = svds(embeddings, k=3, maxiter=20000, which='LM')

# Plot terms on the first two left-singular vectors.
x_dim = 0
y_dim = 1
projection = pd.DataFrame({'term': corpus.get_terms(),
                           'x': U.T[x_dim],
                           'y': U.T[y_dim]}).set_index('term')
html = st.produce_pca_explorer(corpus,
                               category='democrat',
                               category_name='Democratic',
                               not_category_name='Republican',
                               projection=projection,
                               metadata=convention_df['speaker'],
                               width_in_pixels=1000,
                               x_dim=x_dim,
                               y_dim=y_dim)
file_name = 'demo_embeddings_svd_%s_%s.html' % (x_dim, y_dim)
# Fix: context manager ensures the file is closed after the write.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open', file_name, 'in chrome')