import pandas as pd
import scattertext as st


def main():
    # Download two Japanese texts from Project Gutenberg and compare them.
    shisei = _parse_geutenberg(
        'http://www.gutenberg.org/files/31617/31617-0.txt')
    horadanshaku = _parse_geutenberg(
        'http://www.gutenberg.org/files/34084/34084-0.txt')
    df = pd.DataFrame({
        'text': [shisei, horadanshaku],
        'title': ['Shisei', 'Horadanshaku tabimiyage'],
        'author': ['Akutagawa Ryunosuke', 'Kuni Sasaki']
    })
    # Tokenize the Japanese text with scattertext's built-in Japanese NLP helper.
    df['text'] = df['text'].apply(st.japanese_nlp)
    corpus = st.CorpusFromParsedDocuments(
        df, category_col='title', parsed_col='text').build()
    html = st.produce_scattertext_explorer(
        corpus,
        category='Shisei',
        category_name='Shisei',
        not_category_name='Horadanshaku tabimiyage',
        minimum_term_frequency=5,
        width_in_pixels=1000,
        metadata=df['title'] + ' by ' + df['author'],
        asian_mode=True)
    open('./demo_japanese.html', 'w').write(html)
    print('Open ./demo_japanese.html in Chrome or Firefox.')
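# A minimal sketch of the _parse_geutenberg helper referenced above (an
# assumption; the original definition is not included in this snippet). It
# downloads the UTF-8 text and crudely trims the Project Gutenberg header and
# footer markers.
import urllib.request


def _parse_geutenberg(url):
    text = urllib.request.urlopen(url).read().decode('utf-8', errors='ignore')
    start = text.find('*** START')  # marker strings are assumptions
    end = text.find('*** END')
    return text[start:end] if start != -1 and end != -1 else text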
def scattertext_function(self):
    ## START
    nlp = spacy.load('en_core_web_sm')
    convention_df = pd.read_csv(
        "After_Classification/After_Classification_NY_6.csv")
    convention_df['parsed'] = convention_df.tweet.apply(nlp)
    ## Index(['Unnamed: 0', 'Date', 'name', 'tweet', 'death', 'Classification'], dtype='object')
    # print("Document Count")
    # print(convention_df.groupby('Classification')['tweet'].count())
    # print("Word Count")
    # print(convention_df.groupby('Classification').apply(lambda x: x.tweet.apply(lambda x: len(x.split())).sum()))
    # print(type(convention_df))

    ## Convert the DataFrame into a Scattertext corpus
    corpus = st.CorpusFromParsedDocuments(convention_df,
                                          category_col='Classification',
                                          parsed_col='parsed').build()
    print(type(st.Scalers.log_scale_standardize))
    list(corpus.get_scaled_f_scores_vs_background().index[:10])
    html = st.produce_scattertext_explorer(
        corpus,
        category='pos',
        category_name='POS',
        not_category_name='NEG',
        minimum_term_frequency=5,
        width_in_pixels=1000,
        transform=st.Scalers.log_scale_standardize)
    file_name_1 = 'After_Classification_NY_6.html'
    open(file_name_1, 'wb').write(html.encode('utf-8'))
    print(IFrame(src=file_name_1, width=1200, height=700))
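# Note (hedged): print(IFrame(...)) above only prints the IFrame object's repr.
# In a Jupyter notebook the embedded page is rendered by displaying the object
# instead, for example:
from IPython.display import IFrame, display
display(IFrame(src='After_Classification_NY_6.html', width=1200, height=700))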
def create_scatter_text(writers, names, messages, nonames=False):
    my_df = pd.DataFrame({"author": names, "message": messages})
    nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())
    my_df['parse'] = my_df['message'].apply(nlp)
    corpus = st.CorpusFromParsedDocuments(
        my_df, category_col='author',
        parsed_col='parse').build().get_unigram_corpus().compact(
            st.AssociationCompactor(2000))
    if nonames:
        html = st.produce_scattertext_explorer(corpus,
                                               category=writers[0],
                                               category_name="Author_0",
                                               not_category_name="Author_1",
                                               minimum_term_frequency=0,
                                               pmi_threshold_coefficient=0,
                                               width_in_pixels=1000,
                                               transform=st.Scalers.dense_rank)
    else:
        html = st.produce_scattertext_explorer(corpus,
                                               category=writers[0],
                                               category_name=writers[0],
                                               not_category_name=writers[1],
                                               minimum_term_frequency=0,
                                               pmi_threshold_coefficient=0,
                                               width_in_pixels=1000,
                                               transform=st.Scalers.dense_rank)
    with open('./demo_compact.html', 'w') as f:
        f.write(html)
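# Hypothetical call site for create_scatter_text (the author names and messages
# below are illustrative placeholders, not data from the original project):
writers = ['alice', 'bob']
names = ['alice', 'alice', 'bob', 'bob']
messages = ['good morning everyone', 'see you at noon',
            'running late today', 'the meeting moved to 3pm']
create_scatter_text(writers, names, messages, nonames=True)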
def create_corpus(self):
    # load cleaned df
    convention_df = self.clean_texts()
    # create parsed corpus
    convention_df.groupby('bias').apply(
        lambda x: x.text.apply(lambda x: len(x.split())).sum())
    convention_df['parsed'] = convention_df.text.apply(nlp)
    corpus = st.CorpusFromParsedDocuments(convention_df,
                                          category_col='bias',
                                          parsed_col='parsed').build()
    # remove stop words
    stop_word_list = [
        'via getty', 'inbox', 'subscribe', '×', 'close ×', 'screen close',
        'full screen', 'buy second', 'second continue', 'story continued',
        'llc permission', '―', 'xe', '\\xe2\\x80\\x99', 'news', 'for reprint',
        'llc', 'post', 'click', 'to', '’ve', 'unsupported on', 'share',
        'that ’s', 'still', 'got', 'it', '37', 'of his', 'this report', 'ofs',
        'fox', 'photos', '’m', 'is the', 's.', 'around', 'times', 'also',
        'the', 'copyright', 'washington times', 'mr', 'press', 'wait',
        'associated', 'unsubscribe', 'view', 'photo wait', 'http', '#',
        'associated press', 'more videos', 'get', 'just watched', 'permission',
        'however', 'b.', 'ms.', 'here©', 'device', 'copyright ©', 'paste',
        '10', 'the associated', 'contributed to', 'hide', 'and his', 'videos',
        'said mr.', '_', '©', 'contributed', 'embed', 'n’t', '/', 'something',
        'i', 'that they', 'read', 'for a', 'playback', 'must watch',
        'washington post', 'just', 'to get', 'r', 'read more', 'toggle',
        'more', 'i ’m', 'follow', 'is', 'https', ' ', 'said', 'mr.',
        'unsupported', 'or blog', 'your device', 'for', 'cnn', 'of 76',
        'that', 'ms', 'andhis', 'click here', 'or share', 'replay',
        'press contributed', 'they', 'must', 'prof', 'www', 'it ’s', 'told',
        '’re', 'the washington', '1', "'s rise", '© 2018', 'to this', 'skip',
        'around the', 'blog', 'cut', 'told fox', 'mrs.', 'hide caption', 'ad',
        'watched', '/ the', 'replay more', 'and the', '’s', '2018', 'copy',
        '&', 'read or', 'reprint permission', 'are', 'told cnn', 'watch',
        'here for', 'also said', 'copy this', 'reprint', 'report',
        'advertisement', 'mrs', 'caption', 'autoplay', 'fox news', 'dr',
        'enlarge', 'times llc', '76', 'photo', 'this'
    ]
    stop_word_list = list(set(stop_word_list))
    update_stop = []
    for term in stop_word_list:
        if term in corpus._term_idx_store:
            update_stop.append(term)
    corpus = corpus.remove_terms(update_stop)
    return corpus
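# Hedged usage sketch (not part of the original): assuming `analyzer` is an
# instance of the class that defines create_corpus above, the returned corpus is
# typically rendered with produce_scattertext_explorer. The 'left'/'right'
# values below are assumptions about the labels in the 'bias' column.
corpus = analyzer.create_corpus()
html = st.produce_scattertext_explorer(corpus,
                                       category='left',
                                       category_name='Left-leaning',
                                       not_category_name='Right-leaning',
                                       minimum_term_frequency=5,
                                       width_in_pixels=1000)
open('bias_scattertext.html', 'w').write(html)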
def get_scattertext_corpus(df, dep_data_col, group1_name, group2_name, lang="en"):
    # Only an example cut-off! Ultimately the user must specify how the groups
    # are determined (unless they upload already-labeled data).
    cut_off = 3
    df.loc[df[dep_data_col] > cut_off, 'label'] = group1_name
    df.loc[df[dep_data_col] < cut_off, 'label'] = group2_name
    df.dropna(inplace=True, axis=0)
    df["lemmas"] = df["nlp"].apply(lambda doc: [
        tok.text for tok in doc
        if not tok.is_punct and not tok.is_stop and len(tok.text) > 1
    ])
    df["lemmas"] = df["lemmas"].apply(lambda text: nlp(" ".join(text)))
    st_corpus = scatter_text.CorpusFromParsedDocuments(
        df, category_col='label',
        parsed_col='lemmas').build().remove_terms(stopwords,
                                                  ignore_absences=True)
    return st_corpus
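# Hypothetical call site for get_scattertext_corpus (assumptions: `scatter_text`
# is `import scattertext as scatter_text`, `nlp` is a loaded spaCy pipeline,
# `stopwords` is a list of terms to drop, and `ratings_df` carries a review
# text column plus a numeric satisfaction score):
ratings_df["nlp"] = ratings_df["review_text"].apply(nlp)
st_corpus = get_scattertext_corpus(ratings_df,
                                   dep_data_col="satisfaction",
                                   group1_name="satisfied",
                                   group2_name="unsatisfied")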
def empath(self, dataframe):
    feat_builder = st.FeatsFromOnlyEmpath()
    empath_corpus = st.CorpusFromParsedDocuments(
        dataframe,
        category_col='Document Type',
        feats_from_spacy_doc=feat_builder,
        parsed_col='Text').build()
    html = st.produce_scattertext_explorer(
        empath_corpus,
        category='submission',
        category_name='Submission',
        not_category_name='Standard',
        width_in_pixels=1000,
        metadata=dataframe['Document'],
        use_non_text_features=True,
        use_full_doc=True,
        topic_model_term_lists=feat_builder.get_top_model_term_lists())
    logger.getLogger().info("Opening Empath Visual")
    open(self.empath_file, 'wb').write(html.encode('utf-8'))
    webbrowser.open("file://" + self.empath_file)
# Load the tweet spreadsheet out of the zip archive (the read_excel wrapper is
# reconstructed; the original snippet begins mid-statement).
df = pd.read_excel(zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx'))
df['first_name'] = df['User Name'].apply(
    lambda x: x.split()[0].lower() if type(x) == str and len(x.split()) > 0 else x)
male_prob = agefromname.AgeFromName().get_all_name_male_prob()
df_aug = pd.merge(df, male_prob, left_on='first_name', right_index=True)
df_aug['gender'] = df_aug['prob'].apply(
    lambda x: 'm' if x > 0.9 else 'f' if x < 0.1 else '?')
df_mf = df_aug[df_aug['gender'].isin(['m', 'f'])]
df_mf.to_csv('emoji_data.csv', index=False)
nlp = st.tweet_tokenzier_factory(nltk.tokenize.TweetTokenizer())
df_mf['parse'] = df_mf['Tweet content'].apply(nlp)
corpus = st.CorpusFromParsedDocuments(
    df_mf,
    parsed_col='parse',
    category_col='gender',
    feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()).build()
html = st.produce_scattertext_explorer(
    corpus,
    category='f',
    category_name='Female',
    not_category_name='Male',
    use_full_doc=True,
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=(df_mf['User Name'] + ' (@' + df_mf['Nickname'] + ') '
              + df_mf['Date'].astype(str)),
    width_in_pixels=1000)
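# The snippet above stops after building `html`; a typical final step (an
# assumption, mirroring the other demos in this collection) is to write the
# visualization to disk:
open('emoji_gender.html', 'wb').write(html.encode('utf-8'))
print('Open emoji_gender.html in Chrome or Firefox.')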
import scattertext as st
import spacy

nlp = spacy.load('en_core_web_sm')
df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(nlp))
corpus = st.CorpusFromParsedDocuments(
    df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=st.FeatsFromSpacyDoc(
        use_lemmas=True)).build().get_unigram_corpus().compact(
            st.AssociationCompactor(2000))
html = st.produce_scattertext_explorer(corpus,
                                       category='democrat',
                                       category_name='Democratic',
                                       not_category_name='Republican',
                                       minimum_term_frequency=0,
                                       pmi_threshold_coefficient=0,
                                       width_in_pixels=1000,
                                       metadata=corpus.get_df()['speaker'],
                                       transform=st.Scalers.dense_rank,
                                       max_overlapping=3)
open('./demo_lemmas.html', 'w').write(html)
print('open ./demo_lemmas.html in Chrome')
nlp = spacy.load('en')
convention_df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(nlp),
    party=lambda df: df.party.apply(
        {'democrat': 'Democratic', 'republican': 'Republican'}.get))
corpus = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=st.PyTextRankPhrases()
).build().compact(
    st.AssociationCompactor(2000, use_non_text_features=True))


class Stats_Graph_Manager:
    def __init__(self):
        pass
import scattertext as st

nlp = spacy.load('en', parser=False)
t0 = time.time()
print('reading dataset')
reviews_df = pd.read_csv(
    'https://github.com/JasonKessler/ICLR18ReviewVis/raw/master/iclr2018_reviews.csv.bz2'
)
print('parsing', time.time() - t0, 's')
reviews_df['parse'] = reviews_df['review'].apply(
    st.whitespace_nlp_with_sentences)
print('building full corpus', time.time() - t0, 's')
full_corpus = (st.CorpusFromParsedDocuments(
    reviews_df,
    category_col='category',
    parsed_col='parse',
    #feats_from_spacy_doc=st.PhraseMachinePhrases()
).build())
term_ranker = st.OncePerDocFrequencyRanker
corpus = (full_corpus.keep_only_these_categories([
    'Accept, Positive', 'Accept, Negative', 'Reject, Positive',
    'Reject, Negative'
], False).get_unigram_corpus().compact(
    st.ClassPercentageCompactor(term_count=5)))
print('finding priors', time.time() - t0, 's')
priors = (st.PriorFactory(
    full_corpus, starting_count=0.01).use_all_categories().get_priors())
print('building four square', time.time() - t0, 's')
spacy.explain("prep")) # to understand tags noun_chunks_df = pd.DataFrame() for i, chunk in enumerate(parsed_review.noun_chunks): noun_chunks_df.loc[i, 'text'] = chunk.text noun_chunks_df.loc[i, 'root'] = chunk.root, noun_chunks_df.loc[i, 'root.text'] = chunk.root.text, noun_chunks_df.loc[i, 'root.dep_'] = chunk.root.dep_ noun_chunks_df.loc[i, 'root.head.text'] = chunk.root.head.text print(noun_chunks_df[:20]) nlp = spacy.load('en_core_web_sm', disable_pipes=["tagger", "ner"]) train_df['parsed'] = train_df.Text[49500:50500].apply(nlp) corpus = st.CorpusFromParsedDocuments(train_df[49500:50500], category_col='Score', parsed_col='parsed').build() from sense2vec.vectors import VectorMap s2v = Sense2VecComponent('data/reddit_vectors-1.1.0/reddit_vectors-1.1.0') spacy_tok.add_pipe(s2v) doc = spacy_tok(u"dessert.") freq = doc[0]._.s2v_freq vector = doc[0]._.s2v_vec most_similar = doc[0]._.s2v_most_similar(5) print(most_similar, freq) doc = spacy_tok(u"burger") most_similar = doc[0]._.s2v_most_similar(4) print(most_similar)
import tempfile

import sentencepiece as spm
import scattertext as st


def train_sentence_piece_tokenizer(documents, vocab_size):
    # Signature reconstructed from the call below; the original snippet begins
    # inside this function body.
    with tempfile.NamedTemporaryFile(delete=True) as tempf:
        with tempfile.NamedTemporaryFile(delete=True) as tempm:
            tempf.write(('\n'.join(documents)).encode())
            mod = spm.SentencePieceTrainer.Train(
                '--input=%s --model_prefix=%s --vocab_size=%s'
                % (tempf.name, tempm.name, vocab_size))
            sp = spm.SentencePieceProcessor()
            sp.load(tempm.name + '.model')
    return sp


sp = train_sentence_piece_tokenizer(convention_df.text.values, 2000)
corpus = st.CorpusFromParsedDocuments(
    convention_df,
    parsed_col='parse',
    category_col='party',
    feats_from_spacy_doc=st.FeatsFromSentencePiece(sp)).build()
html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    sort_by_dist=False,
    metadata=convention_df['party'] + ': ' + convention_df['speaker'],
    term_scorer=st.RankDifference(),
    transform=st.Scalers.dense_rank,
    use_non_text_features=True,
    use_full_doc=True,
)
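# For reference (an assumption, not part of the original snippet): the
# `convention_df` used above is typically the 2012 convention sample corpus
# with a whitespace-parsed column, matching the other convention-data demos
# in this collection.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(
    st.whitespace_nlp_with_sentences)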
import scattertext as st
from scattertext import RankDifference

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(
    st.whitespace_nlp_with_sentences)
unigram_corpus = (st.CorpusFromParsedDocuments(
    convention_df, category_col='party',
    parsed_col='parse').build().get_stoplisted_unigram_corpus())
topic_model = (
    st.SentencesForTopicModeling(unigram_corpus).get_topics_from_terms(
        [
            'obama', 'romney', 'democrats', 'republicans', 'health',
            'military', 'taxes', 'education', 'olympics', 'auto', 'iraq',
            'iran', 'israel'
        ],
        scorer=RankDifference(),
        num_terms_per_topic=20))
topic_feature_builder = st.FeatsFromTopicModel(topic_model)
topic_corpus = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=topic_feature_builder).build()
html = st.produce_scattertext_explorer(
    topic_corpus,
df_1 = df.groupby(["author"]).size().reset_index(name='Counts')
df_1 = df_1.sort_values(by=['Counts'], ascending=False)
df_1 = df_1.head(2)
df_2 = df_1.merge(df, on='author', how='inner')
df_2 = df_2.sort_values(by=['Counts'], ascending=False)
build_corpus = st.CorpusFromPandas(df_2,
                                   category_col='author',
                                   text_col='clean_article_text',
                                   nlp=nlp).build()
df_freq = build_corpus.get_term_freq_df()
df_freq['GLOBE EDITORIAL SCORE'] = build_corpus.get_scaled_f_scores('GLOBE EDITORIAL')
df_freq['Jeffrey Simpson Score'] = build_corpus.get_scaled_f_scores('Jeffrey Simpson')
html = st.produce_scattertext_explorer(build_corpus,
                                       category='GLOBE EDITORIAL',
                                       category_name='GLOBE EDITORIAL',
                                       not_category_name='Jeffrey Simpson',
                                       width_in_pixels=1000,
                                       metadata=df_2['author'])
open("../output/Top_2_Authors.html", 'wb').write(html.encode('utf-8'))

# visualizing Empath topics and categories instead of terms
build_feats = st.FeatsFromOnlyEmpath()
build_corpus_2 = st.CorpusFromParsedDocuments(df_2,
                                              category_col='author',
                                              feats_from_spacy_doc=build_feats,
                                              parsed_col='clean_article_text').build()
html = st.produce_scattertext_explorer(build_corpus_2,
                                       category='GLOBE EDITORIAL',
                                       category_name='GLOBE EDITORIAL',
                                       not_category_name='Jeffrey Simpson',
                                       width_in_pixels=1000,
                                       metadata=df_2['author'],
                                       use_non_text_features=True,
                                       use_full_doc=True,
                                       topic_model_term_lists=build_feats.get_top_model_term_lists())
open("../output/Top_2_Authors-Empath.html", 'wb').write(html.encode('utf-8'))
import scattertext as st
import gensim

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(
    lambda x: {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}[x])
movie_df['parse'] = movie_df.text.apply(st.whitespace_nlp_with_sentences)
corpus = st.CorpusFromParsedDocuments(
    movie_df, category_col='movie_name',
    parsed_col='parse').build().get_stoplisted_unigram_corpus()
category_projection = st.Doc2VecCategoryProjector().project(corpus)
html = st.produce_pairplot(
    corpus,
    category_projection=category_projection,
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    scaler=st.Scalers.scale_0_to_1,
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js'))
file_name = 'movie_pair_plot_d2v.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('./' + file_name)
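# Side note (hedged): produce_pairplot can also be run without a precomputed
# category projection, in which case scattertext falls back to its default
# category projector; this shorter variant is an assumption based on the
# library's documentation, not part of the original snippet.
html_default = st.produce_pairplot(
    corpus,
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'])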
import scattertext as st
import spacy

nlp = spacy.load('en_core_web_sm')
df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: list(nlp.pipe(df.text)))
corpus = st.CorpusFromParsedDocuments(
    df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=st.SpacyEntities(
        entity_types_to_use=['NAME', 'LOC'])).build()
html = st.produce_scattertext_explorer(corpus,
                                       category='democrat',
                                       category_name='Democratic',
                                       not_category_name='Republican',
                                       minimum_term_frequency=0,
                                       pmi_threshold_coefficient=0,
                                       width_in_pixels=1000,
                                       metadata=corpus.get_df()['speaker'],
                                       transform=st.Scalers.dense_rank,
                                       max_overlapping=10,
                                       max_docs_per_category=0)
open('./demo_names2.html', 'w').write(html)
print('open ./demo_names2.html in Chrome')
def Plot_Clusters_Kmeans(outputfile_name, nbr_clusters, path):
    list_stop_words = [stemmer.stem(stopWords[i]) for i in range(len(stopWords))]

    # One scatterplot per cluster: cluster i vs. all other clusters.
    text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
    text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
    corpus = (st.CorpusFromPandas(text2kw_clusters,
                                  category_col='Cluster',
                                  text_col='Text',
                                  nlp=nlp).build()
              .remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True)
              .get_unigram_corpus()
              .compact(st.ClassPercentageCompactor(term_count=2,
                                                   term_ranker=st.OncePerDocFrequencyRanker)))
    for i in range(nbr_clusters):
        directory = path + r"\\" + str(i) + r"\\"
        try:
            os.mkdir(directory)
        except FileExistsError:
            pass
        html = st.produce_scattertext_explorer(corpus,
                                               category=str(i),
                                               category_name=str(i) + " Category",
                                               not_category_name='Other Categories',
                                               metadata=text2kw_clusters['Date'],
                                               minimum_term_frequency=50)
        filename = directory + str(i) + "_Category-VS-other categories.html"
        open(filename, 'wb+').write(html.encode('utf-8'))

    # Word-similarity (gensim word2vec) plots per cluster vs. the rest.
    text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
    text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
    text2kw_clusters['Text'] = text2kw_clusters['Text'].apply(nlp)
    corpus = (st.CorpusFromParsedDocuments(text2kw_clusters,
                                           category_col='Cluster',
                                           parsed_col='Text')
              .build()
              .remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True))
    for i in range(nbr_clusters):
        directory = path + r"\\" + str(i) + r"\\"
        m = text2kw_clusters[text2kw_clusters["Cluster"] == str(i)]
        liste = [word_tokenize(str(x)) for x in m["processedReviews"]
                 if not stemmer.stem(str(x)) in list_stop_words]
        words = []
        for j in range(len(liste)):
            for k in range(len(liste[j])):
                if not (liste[j][k] in list_stop_words):
                    try:
                        words.append(liste[j][k])
                    except:
                        pass
        counter = collections.Counter(words)
        c = counter.most_common()
        html = word_similarity_explorer_gensim(corpus,
                                               category=str(i),
                                               category_name=str(i) + " Category",
                                               not_category_name='Other Categories',
                                               minimum_term_frequency=int(text2kw_clusters.shape[0] * 0.005),
                                               target_term=stemmer.stem(c[0][0]),
                                               # pmi_threshold_coefficient=4,
                                               width_in_pixels=1000,
                                               metadata=text2kw_clusters['Date'],
                                               word2vec=model,
                                               max_p_val=0.05,
                                               save_svg_button=True)
        filename = directory + str(i) + "_w2v_Category-VS-other categories.html"
        open(filename, 'wb+').write(html.encode('utf-8'))

    # Positive vs. negative verbatims within each cluster.
    for i in range(nbr_clusters):
        directory = path + r"\\" + str(i) + r"\\"
        text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
        text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
        text2kw_clusters['Sentiments'] = text2kw_clusters['Sentiments'].astype(str)
        text2kw_clusters['Date'] = text2kw_clusters['Date'].astype(str)
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Cluster"] == str(i)]
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Sentiments"] != "neutral"]
        corpus = (st.CorpusFromPandas(text2kw_clusters,
                                      category_col='Sentiments',
                                      text_col='Text',
                                      nlp=nlp).build()
                  .remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True))
        html = st.produce_scattertext_explorer(corpus,
                                               category="positive",
                                               category_name="Positive Verbatims",
                                               not_category_name='Negative Verbatims',
                                               metadata=text2kw_clusters['Date'],
                                               minimum_term_frequency=int(text2kw_clusters.shape[0] * 0.005))
        filename = directory + str(i) + "_Positive_Category-VS-Negative_Category.html"
        open(filename, 'wb+').write(html.encode('utf-8'))

    # Word-similarity plots of positive vs. negative verbatims within each cluster.
    for i in range(nbr_clusters):
        directory = path + r"\\" + str(i) + r"\\"
        text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
        text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
        text2kw_clusters['Sentiments'] = text2kw_clusters['Sentiments'].astype(str)
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Cluster"] == str(i)]
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Sentiments"] != "neutral"]
        text2kw_clusters['Text'] = text2kw_clusters['Text'].apply(nlp)
        liste = [word_tokenize(str(x)) for x in text2kw_clusters["processedReviews"]
                 if not stemmer.stem(str(x)) in list_stop_words]
        words = []
        for j in range(len(liste)):
            for k in range(len(liste[j])):
                if not (liste[j][k] in list_stop_words):
                    try:
                        words.append(liste[j][k])
                    except:
                        pass
        counter = collections.Counter(words)
        c = counter.most_common()
        corpus = (st.CorpusFromParsedDocuments(text2kw_clusters,
                                               category_col='Sentiments',
                                               parsed_col='Text')
                  .build()
                  .remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True))
        html = word_similarity_explorer_gensim(corpus,
                                               category="positive",
                                               category_name="Positive Verbatims",
                                               not_category_name='Negative Verbatims',
                                               minimum_term_frequency=int(text2kw_clusters.shape[0] * 0.005),
                                               target_term=stemmer.stem(c[0][0]),
                                               # pmi_threshold_coefficient=4,
                                               width_in_pixels=1000,
                                               metadata=text2kw_clusters['Date'],
                                               word2vec=model,
                                               max_p_val=0.05,
                                               save_svg_button=True)
        filename = directory + str(i) + "_w2v__Positive_Category-VS-Negative_Category.html"
        open(filename, 'wb+').write(html.encode('utf-8'))
import re

import pandas as pd
import scattertext as st

data = [
    {'text': "I don't think you'll want to.", 'category': 'a'},
    {'text': "You'll have a didn't a-b #dfs .", 'category': 'a'},
    {'text': "You'll shoudn't #have a, didn't a-b #dfs .", 'category': 'a'},
    {'text': "Can't not get along to didn't.", 'category': 'b'},
    {'text': "Can't try aba-ba alo33ng to didn't.", 'category': 'b'},
    {'text': "Can't no't g'e't al33ong 3to5.", 'category': 'b'},
    {'text': "You haven't changed a b'it.", 'category': 'c'},
    {'text': "You haven't changed a b'it.", 'category': 'c'},
    {'text': "You haven't ch5ng3d a bit.", 'category': 'c'}
]
df = pd.DataFrame(data)
df['parse'] = df.text.apply(
    lambda x: st.whitespace_nlp_with_sentences(x, tok_splitter_re=re.compile('( )')))
corpus = st.CorpusFromParsedDocuments(
    df, parsed_col='parse',
    category_col='category').build().get_unigram_corpus()
semiotic_square = st.SemioticSquare(
    corpus,
    category_a='a',
    category_b='b',
    neutral_categories=['c'],
    scorer=st.RankDifference(),
    labels={'not_a_and_not_b': 'Plot Descriptions',
            'a_and_b': 'Reviews',
            'a_and_not_b': 'Positive',
            'b_and_not_a': 'Negative',
            'a': '',
            'b': '',
            'not_a': '',
            'not_b': ''})
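# Hedged follow-up (not in the original snippet): a semiotic square built this
# way is typically rendered with scattertext's semiotic-square explorer; the
# axis labels and output file name below are illustrative assumptions.
html = st.produce_semiotic_square_explorer(semiotic_square,
                                           category_name='a',
                                           not_category_name='b',
                                           x_label='a-b',
                                           y_label='c-ab',
                                           neutral_category_name='c',
                                           metadata=df['category'])
open('demo_semiotic_square.html', 'w').write(html)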
        'income'
    ],
    'jobs': ['jobs', 'workers', 'labor', 'employment', 'worker', 'employee', 'job'],
    'patriotic': ['america', 'country', 'flag', 'americans', 'patriotism', 'patriotic'],
    'family': [
        'mother', 'father', 'mom', 'dad', 'sister', 'brother',
        'grandfather', 'grandmother', 'son', 'daughter'
    ]
}
topic_feature_builder = st.FeatsFromTopicModel(topic_model)
topic_corpus = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=topic_feature_builder).build()
html = st.produce_scattertext_explorer(
    topic_corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    use_non_text_features=True,
    use_full_doc=True,
    pmi_threshold_coefficient=0,
    topic_model_term_lists=topic_feature_builder.get_top_model_term_lists())
import scattertext as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse.linalg import svds

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(
    st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse').build()
          .get_stoplisted_unigram_corpus()
          .remove_infrequent_words(minimum_term_count=3,
                                   term_ranker=st.OncePerDocFrequencyRanker))
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat()).T
U, S, VT = svds(embeddings, k=3, maxiter=20000, which='LM')
x_dim = 0
y_dim = 1
projection = pd.DataFrame({
    'term': corpus.get_terms(),
    'x': U.T[x_dim],
    'y': U.T[y_dim]
}).set_index('term')
html = st.produce_pca_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    projection=projection,
    metadata=convention_df['speaker'],
# ================================================================================
all_satisfaction_score_comment_in_all_conds = utils_data.get_all_satisfaction_score_comment_in_all_conds()
# ================================================================================
columns = ['senti_on_Metfor_oral', 'feature', 'review']
all_satisfaction_score_comment_in_all_conds_df = pd.DataFrame(
    all_satisfaction_score_comment_in_all_conds, index=None, columns=columns)
all_satisfaction_score_comment_in_all_conds_df['parse'] = \
    all_satisfaction_score_comment_in_all_conds_df['review'].apply(
        st.whitespace_nlp_with_sentences)
# ================================================================================
corpus = (st.CorpusFromParsedDocuments(
    all_satisfaction_score_comment_in_all_conds_df,
    category_col='senti_on_Metfor_oral',
    parsed_col='parse').build().get_stoplisted_unigram_corpus())
# ================================================================================
html = st.produce_projection_explorer(
    corpus,
    category='negative',
    category_name='Negative',
    not_category_name='Positive',
    metadata=all_satisfaction_score_comment_in_all_conds_df.feature,
    width_in_pixels=1000)
# ================================================================================
file_name = '/mnt/1T-5e7/mycodehtml/Data_mining/Visualization/Scattertext/demo_tsne_style.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open', file_name, 'in chrome')
from sklearn.decomposition import TruncatedSVD

import scattertext as st
from scattertext import ClassPercentageCompactor, CSRMatrixFactory
from scattertext.representations.CorpusSentenceIterator import CorpusSentenceIterator

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(
    st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(
    convention_df, category_col='party',
    parsed_col='parse').build().get_stoplisted_unigram_corpus().select(
        ClassPercentageCompactor(term_count=3)))
html = st.produce_projection_explorer(corpus,
                                      embeddings=corpus.get_term_doc_mat(),
                                      projection_model=TruncatedSVD(
                                          n_components=30, n_iter=10),
                                      x_dim=0,
                                      y_dim=1,
                                      category='democrat',
                                      category_name='Democratic',
                                      not_category_name='Republican',
                                      metadata=convention_df.speaker,
                                      width_in_pixels=1000)
file_name = 'demo_bow_pca.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open', file_name, 'in chrome')
################################################ Scatterplot2 ###################################################################
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
import scattertext as st
from scipy.sparse.linalg import svds

Data_join['parse'] = Data_join['content'].apply(st.whitespace_nlp_with_sentences)

# Corpus for scatterplot2
corpus = (st.CorpusFromParsedDocuments(Data_join,
                                       category_col='review',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus())
corpus = corpus.add_doc_names_as_metadata(corpus.get_df()['author'])
corpus.get_df()['content']
len(corpus.get_metadata())
print(corpus.get_term_doc_mat())

# Eigenvalue matrix creation: tf-idf document embeddings decomposed with truncated SVD
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
u, s, vt = svds(embeddings, k=167, maxiter=20000, which='LM')
projection = pd.DataFrame({'term': corpus.get_metadata(),
                           'x': u.T[0],
                           'y': u.T[1]}).set_index('term')
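# Hedged follow-up (not in the original): a document projection like the one
# above is typically visualized with scattertext's PCA explorer, as in the
# other demos in this collection. The category values ('positive'/'negative')
# and the output file name are assumptions about the 'review' column.
html = st.produce_pca_explorer(corpus,
                               category='positive',
                               category_name='Positive reviews',
                               not_category_name='Negative reviews',
                               projection=projection,
                               x_dim=0,
                               y_dim=1,
                               use_non_text_features=True)
open('scatterplot2.html', 'wb').write(html.encode('utf-8'))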
df = pd.read_csv('./complete_data.tsv', sep='\t')
data_nix_ken = pd.read_csv('./stats/data_nix_ken.tsv', sep='\t')

if not os.path.exists('plots'):
    os.makedirs('plots')

# Scattertext attack vs support (only responses)
# https://kanoki.org/2019/03/17/text-data-visualization-in-python/
# https://github.com/JasonKessler/scattertext
nlp = spacy.load('en_core_web_sm')
for data_set in ['debate_test', 'debate_train', 'procon', 'political']:
    df_plot = df.loc[(df['org_dataset'] == data_set)
                     & (df['label'].isin(['attack', 'support']))]
    df_plot['parsed'] = df_plot['response'].apply(nlp)
    corpus = st.CorpusFromParsedDocuments(df_plot,
                                          category_col='label',
                                          parsed_col='parsed').build()
    html = st.produce_scattertext_explorer(
        corpus,
        category='attack',
        not_category_name='support',
        width_in_pixels=1000,
        minimum_term_frequency=5,
        transform=st.Scalers.log_scale_standardize,
        use_full_doc=True)
    file_name = './plots/scattertext_attack_support' + data_set + '.html'
    with open(file_name, 'wb') as file:
        file.write(html.encode('utf-8'))

# Scattertext Nixon vs Kennedy
df_plot = data_nix_ken
import scattertext as st
import spacy

# spacy.blank() expects a language code such as 'en' rather than a packaged
# model name; the blank English pipeline still carries tokenizer rules.
nlp = spacy.blank('en')
# Drop tokenizer exceptions containing apostrophes so contractions stay whole.
nlp.tokenizer.rules = {key: value for key, value in nlp.tokenizer.rules.items()
                       if "'" not in key and "’" not in key and "‘" not in key}
nlp.add_pipe(nlp.create_pipe('sentencizer'))
df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(nlp))
corpus = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().compact(st.ClassPercentageCompactor(term_count=10))
html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    show_diagonal=False,
    max_overlapping=3)
open('./demo_with_apostrophes.html', 'w').write(html)
print('open ./demo_with_apostrophes.html in Chrome')
import scattertext as st

df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
corpus = st.CorpusFromParsedDocuments(
    df, category_col='party',
    parsed_col='parse').build().get_unigram_corpus().compact(
        st.AssociationCompactor(2000))
html = st.produce_scattertext_explorer(corpus,
                                       category='democrat',
                                       category_name='Democratic',
                                       not_category_name='Republican',
                                       minimum_term_frequency=0,
                                       pmi_threshold_coefficient=0,
                                       width_in_pixels=1000,
                                       metadata=corpus.get_df()['speaker'],
                                       transform=st.Scalers.dense_rank,
                                       show_diagonal=False,
                                       max_overlapping=3,
                                       vertical_lines=0.5)
open('./demo_vertical_lines.html', 'w').write(html)
print('open ./demo_vertical_lines.html in Chrome')
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(
    st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(
    convention_df, category_col='party',
    parsed_col='parse').build().get_stoplisted_unigram_corpus())
html = st.produce_projection_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    metadata=convention_df.speaker,
    color_func='''(function(d) {return d.s > 0.5 ? d3.interpolateRdYlBu(0.6) : d3.interpolateRdYlBu(0.4)})''',
    center_label_over_points=True,
    censor_points=True,
    width_in_pixels=1000)
file_name = 'demo_tsne_style_for_publication.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open', file_name, 'in chrome')