# In[15]:

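# Split the tweets by predicted sentiment, tokenize each one on whitespace,
# and pull the highest-TF-IDF tokens for the positive and negative groups.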
pos_tweet = [x.split() for x in data[data['new_sentiment'] == 'positive']['text']]
neu_tweet = [x.split() for x in data[data['new_sentiment'] == 'neutral']['text']]
neg_tweet = [x.split() for x in data[data['new_sentiment'] == 'negative']['text']]
postop30tfidf = tfidf.get_top_n_tfidf_bow(pos_tweet, top_n_tokens=30)
negtop30tfidf = tfidf.get_top_n_tfidf_bow(neg_tweet, top_n_tokens=30)
print('top 30 negative review tfidf', negtop30tfidf)
print('top 30 positive review tfidf', postop30tfidf)

# In[16]:

top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tweet, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tweet, topn=10)
print(top10_posfreq_list)
print(top10_negfreq_list)

# In[17]:

DataExploration.generate_word_cloud(pos_tweet)

# In[18]:

DataExploration.generate_word_cloud(neg_tweet)

# We didn't remove stop words, so LDA does not work well here; for topic
# modelling, stop words need to be removed, but for sentiment analysis it is
# better to keep all words. However, even for sentiment analysis we should set
# a minimum word length, which we set to 2 here.
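
# A minimal sketch of the point above, assuming the textClean.pipeline keyword
# arguments used elsewhere in this notebook (arguments not listed are assumed
# to keep their defaults); the only switch between the two use cases is nltk_stop.
sample_texts = ['the vaccine gave me a sore arm', 'no side effects at all']
topic_tokens = textClean.pipeline(sample_texts, multi_gram=[1], lower_case=True,
                                  nltk_stop=True,  # drop stop words for LDA
                                  word_length=2, remove_consecutives=True)
sentiment_tokens = textClean.pipeline(sample_texts, multi_gram=[1], lower_case=True,
                                      nltk_stop=False,  # keep stop words for sentiment
                                      word_length=2, remove_consecutives=True)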

# The head of find_side_effect was cut off in this export; a minimal
# reconstruction is assumed here: keep tweets whose text mentions side effects.
def find_side_effect(df):
    df_filt = df[df['text'].str.contains('side effect', case=False, na=False)]
    print('vaccine ', len(df_filt))
    return df_filt

side_effect = find_side_effect(data).reset_index(drop=True)
side_effect = side_effect[side_effect.sentiment == 'negative']
side_effect['explore_text'] = textClean.pipeline(side_effect['text'].to_list(), multi_gram=[1], lower_case=True,
                                                 deacc=False, encoding='utf8', errors='strict', stem_lemma='lemma',
                                                 tag_drop=[], nltk_stop=True,
                                                 stop_word_list=['effect', 'vaccine', 'side', 'covid'],
                                                 check_numbers=False, word_length=2, remove_consecutives=True)


# In[172]:


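# Explore the negative side-effect tweets over the next three cells: token
# frequencies, TF-IDF scores, and a word cloud.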
print(DataExploration.get_topn_freq_bow(list(side_effect['explore_text']), topn=30))


# In[173]:


print(tfidf.get_top_n_tfidf_bow(list(side_effect['explore_text']), top_n_tokens=30))


# In[174]:


DataExploration.generate_word_cloud(list(side_effect['explore_text']))


# In[162]:
# Example 3
# The opening of this call was cut off in the export; the assignment target and
# source column are assumed here (title_tokens is what the cells below use),
# mirroring the other textClean.pipeline calls in this notebook.
train['title_tokens'] = textClean.pipeline(train['title'].to_list(),
                                           multi_gram=[1],
                                           lower_case=True,
                                           deacc=False,
                                           encoding='utf8',
                                           errors='strict',
                                           stem_lemma='lemma',
                                           tag_drop=[],
                                           nltk_stop=True,
                                           stop_word_list=[],
                                           remove_pattern=[],
                                           check_numbers=True,
                                           word_length=2,
                                           remove_consecutives=True)

# In[43]:

selected_tokens = train[train.LABEL == 'Mathematics']['title_tokens'].to_list()
top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(selected_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(selected_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
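# Fit a 10-topic LDA model on the Mathematics titles and inspect the topics.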
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(selected_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[44]:

DataExploration.generate_word_cloud(selected_tokens)

# ## Fit model based on title
# Example 4
data = raw_data.copy()


# In[15]:


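# Clean and tokenize the raw text: lower-case, lemmatize, drop NLTK stop words,
# and keep only tokens of length >= 2.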
data['token'] = textClean.pipeline(raw_data['text'].to_list(), multi_gram=[1], lower_case=True, deacc=False, encoding='utf8',
                                   errors='strict', stem_lemma='lemma', tag_drop=[], nltk_stop=True,
                                   stop_word_list=[], remove_pattern=[],
                                   check_numbers=True, word_length=2, remove_consecutives=True)


# In[16]:


top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(data['token'].to_list(), topn=10)]
print(top_10_freq_words)


# In[17]:


top30tfidf = tfidf.get_top_n_tfidf_bow(data['token'].to_list(), top_n_tokens=30)
print('top 30 tfidf', top30tfidf)


# In[18]:


DataExploration.generate_word_cloud(data['token'].to_list())
# Example 5
pos_tokens[0]


# In[41]:


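# Highest-TF-IDF tokens in the positive and negative reviews.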
postop30tfidf = tfidf.get_top_n_tfidf_bow(pos_tokens, top_n_tokens=30)
negtop30tfidf = tfidf.get_top_n_tfidf_bow(neg_tokens, top_n_tokens=30)
print('top 30 negative review tfidf', negtop30tfidf)
print('top 30 positive review tfidf', postop30tfidf)


# In[36]:


top10_freq_list = DataExploration.get_topn_freq_bow(preprocessed_tokens, topn=10)
top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tokens, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tokens, topn=10)
print(top10_freq_list)
print(top10_posfreq_list)
print(top10_negfreq_list)


# In[37]:


DataExploration.generate_word_cloud(pos_tokens)


# In[38]:
# Example 6
processed_letter_df.to_csv(os.path.join(data_path, 'processed_letter.csv'))


# ## Data Exploration

# In[4]:


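# Reload the processed letters; clean_letter is a space-joined string, so split
# it back into token lists.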
processed_letter_df = pd.read_csv(os.path.join(data_path, 'processed_letter.csv'))
processed_letter_df['tokens'] = processed_letter_df.clean_letter.apply(lambda x: x.split(' '))


# In[26]:


top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(processed_letter_df['tokens'].to_list(), topn=10)]
print(top_10_freq_words)


# In[27]:


top30tfidf = tfidf.get_top_n_tfidf_bow(processed_letter_df['tokens'].to_list(), top_n_tokens=30)
print('top 30 tfidf', top30tfidf)


# In[28]:


DataExploration.generate_word_cloud(processed_letter_df['tokens'].to_list())

# The opening of this call was cut off in the export; the assignment target and
# source column are assumed here (djia_tokens is what the cells below use),
# mirroring the other textClean.pipeline calls in this notebook.
djia_tokens = textClean.pipeline(djia['text'].to_list(),
                                 multi_gram=[1],
                                 lower_case=True,
                                 deacc=False,
                                 encoding='utf8',
                                 errors='strict',
                                 stem_lemma='lemma',
                                 tag_drop=['V'],
                                 nltk_stop=True,
                                 stop_word_list=[],
                                 remove_pattern=['http:', '#', '@'],
                                 check_numbers=True,
                                 word_length=2,
                                 remove_consecutives=True)

# In[23]:

top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(djia_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(djia_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
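# Fit a 10-topic LDA model on the DJIA tokens and inspect the topics.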
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(djia_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[20]:

DataExploration.generate_word_cloud(djia_tokens)

# In[27]: