test = data[(data.date.astype(str) == '2021-06-01') & (data.country == 'India')].reset_index(drop=True)
test.orig_text[5]


# In[49]:


test['explore_text'] = textClean.pipeline(test['text'].to_list(), multi_gram=[1], lower_case=True,
                                          deacc=False, encoding='utf8', errors='strict', stem_lemma='lemma',
                                          tag_drop=[], nltk_stop=True,
                                          stop_word_list=['effect', 'vaccine', 'side', 'covid'],
                                          check_numbers=False, word_length=2, remove_consecutives=True)
print(tfidf.get_top_n_tfidf_bow(list(test['explore_text']), top_n_tokens=30))
DataExploration.generate_word_cloud(list(test['explore_text']))
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(list(test['explore_text']), num_topics=no_topics)
lda.lda_topics(lda_allbow)
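
# The cells in this notebook hard-code no_topics = 10; if fit_lda returns a
# gensim LdaModel (an assumption about this helper library), topic coherence
# can be used to sanity-check that choice. A minimal sketch:
from gensim.models import CoherenceModel

cm = CoherenceModel(model=lda_allbow, texts=list(test['explore_text']),
                    dictionary=dictionary, coherence='c_v')
print('c_v coherence:', cm.get_coherence())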


# From the plots above we can see that people in India and Canada tweeted a lot in March and April, and that the spikes in the overall volume trend were driven by India, so we will explore Canada and India further. For India, it seems most tweets are neutral.
# 
# In India, on 2021-03-01 most tweets were about vaccines; on 2021-04-21 most tweets were about vaccines, infection, and medical services; and around 2021-06-01 India started using Sputnik, etc.

# In[76]:


canada_overtime = data[data.country.isin(['Canada'])].groupby(['date', 'sentiment']).agg(**{'tweets': ('id', 'count')}).reset_index().dropna()
fig = px.line(canada_overtime, x='date', y='tweets', color='sentiment',
             title='Timeline showing emotion of tweets in Canada about COVID-19 vaccines')
fig.show()
# Example 2

# The head of this call was truncated in the source; reconstructed (assumed) from
# the 'title_tokens' column used below. Arguments from remove_pattern onward are original.
train['title_tokens'] = textClean.pipeline(train['TITLE'].to_list(),
                                           multi_gram=[1], lower_case=True,
                                           stem_lemma='lemma', nltk_stop=True,
                                           remove_pattern=[],
                                           check_numbers=True,
                                           word_length=2,
                                           remove_consecutives=True)

# In[43]:

selected_tokens = train[train.LABEL == 'Mathematics']['title_tokens'].to_list()
top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(selected_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(selected_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(selected_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[44]:

DataExploration.generate_word_cloud(selected_tokens)

# ## Fit model based on title

# In[58]:

train_index, test_index = train_test_split(train.index,
                                           test_size=0.33,
                                           random_state=42)
X_train = train[train.index.isin(train_index)][['TITLE']]
X_test = train[train.index.isin(test_index)][['TITLE']]
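
# The heading says "Fit model based on title" but only the split is shown above;
# a minimal sketch of such a model, assuming scikit-learn is available and that
# train.LABEL holds the target (title_clf is an illustrative name):
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

y_train = train.loc[train_index, 'LABEL']
y_test = train.loc[test_index, 'LABEL']
title_clf = make_pipeline(TfidfVectorizer(min_df=2), LogisticRegression(max_iter=1000))
title_clf.fit(X_train['TITLE'], y_train)
print('title model accuracy:', title_clf.score(X_test['TITLE'], y_test))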
# In[17]:

DataExploration.generate_word_cloud(pos_tweet)

# In[18]:

DataExploration.generate_word_cloud(neg_tweet)

# We didn't remove stop words, so LDA does not work well here; for topic modelling we need to remove stop words, but for sentiment analysis it is better to keep all words.
# However, even for sentiment analysis we should enforce a minimum word length; here we set it to 2. A sketch of the two configurations follows.
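
# A sketch of the two configurations, assuming the same textClean.pipeline API
# used elsewhere in this notebook (docs is an illustrative list of raw strings):
docs = ['this is not a bad vaccine', 'side effects were mild']
# For topic modelling: drop stop words so LDA topics are not dominated by them.
lda_tokens = textClean.pipeline(docs, multi_gram=[1], lower_case=True,
                                stem_lemma='lemma', nltk_stop=True,
                                check_numbers=False, word_length=2,
                                remove_consecutives=True)
# For sentiment analysis: keep stop words (negations such as 'not' carry signal)
# but still enforce the minimum word length of 2.
sent_tokens = textClean.pipeline(docs, multi_gram=[1], lower_case=True,
                                 stem_lemma='lemma', nltk_stop=False,
                                 check_numbers=False, word_length=2,
                                 remove_consecutives=True)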

# In[20]:

no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(pos_tweet,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# ## Prepare training/testing/validation dataset

# In[3]:

X = [x for x in data.text]
y = pd.get_dummies(data.label).values
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=11)
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  test_size=0.125,
                                                  random_state=11)
# 0.125 of the remaining 80% gives a 70/10/20 train/val/test split overall.
# (The closing arguments above were truncated in the source; random_state is
# assumed to match the earlier split.)

test.head(3)


# In[107]:


display(tfidf_data.head(3))
list(test.title_x)


# ## LDA

# In[133]:


lda_allbow, bow_corpus, dictionary = lda.fit_lda(sentences, num_topics = 5)


# In[134]:


lda.lda_topics(lda_allbow)


# In[135]:


import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
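
# The cell above only enables notebook rendering; a sketch of the (assumed)
# intended visualisation step, using pyLDAvis.gensim_models.prepare on the
# gensim model, corpus, and dictionary returned by fit_lda:
vis = gensimvis.prepare(lda_allbow, bow_corpus, dictionary)
pyLDAvis.display(vis)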
# Example 5

top30tfidf = tfidf.get_top_n_tfidf_bow(data['token'].to_list(), top_n_tokens = 30)
print('top 30 tfidf', top30tfidf)


# In[18]:


DataExploration.generate_word_cloud(data['token'].to_list())


# In[19]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(data['token'].to_list(), num_topics = no_topics)
lda.lda_topics(lda_allbow)


# In[33]:


tfidf_data = tfidf.get_tfidf_dataframe(data['token'].to_list(),
                                       doc_index=list(data.doc_id),
                                       no_below=5,
                                       no_above=0.5,
                                       keep_n=100000)


# In[28]:
# Example 6

DataExploration.generate_word_cloud(pos_tweet)

# In[41]:

DataExploration.generate_word_cloud(neg_tweet)

# In[42]:

DataExploration.generate_word_cloud(neu_tweet)

# ## LDA

# In[43]:

no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(pos_tweet,
                                                 num_topics=no_topics)
# lda_top30bow, bow_corpus, dictionary  = lda.fit_lda(pos_tweet, top_n_tokens = 30, num_topics = no_topics)
lda.lda_topics(lda_allbow)

# In[44]:

no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(neg_tweet,
                                                 num_topics=no_topics)
# lda_top30bow, bow_corpus, dictionary  = lda.fit_lda(neg_tweet, top_n_tokens = 30, num_topics = no_topics)
lda.lda_topics(lda_allbow)

# ## Explore text for each vacc

# In[51]:
# Example 7

top30tfidf = tfidf.get_top_n_tfidf_bow(processed_letter_df['tokens'].to_list(), top_n_tokens = 30)
print('top 30 tfidf', top30tfidf)


# In[28]:


DataExploration.generate_word_cloud(processed_letter_df['tokens'].to_list())


# In[29]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(processed_letter_df['tokens'].to_list(), num_topics = no_topics)
lda.lda_topics(lda_allbow)


# ### Words Frequency

# In[8]:


dictionary = DocVector.generate_corpus_dict(processed_letter_df['tokens'].to_list(),
                                            no_below=1, no_above=1, keep_n=100000)
bow_corpus = DocVector.create_document_vector(processed_letter_df['tokens'].to_list(), dictionary)
my_df = DocVector.get_vocab_matrix(bow_corpus, dictionary)
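
# The "Words Frequency" heading promises counts; a short follow-up, assuming
# get_vocab_matrix returns a documents-by-vocabulary count DataFrame:
word_freq = my_df.sum(axis=0).sort_values(ascending=False)
print(word_freq.head(20))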


# In[31]:
# Example 8

# The function header was truncated in the source; reconstructed (assumed) from
# the print_job(raw_data, 50) call below and the sections it prints.
def print_job(df, row_id):
    printdata = df[df.index == row_id]
    print('-------------------- Description --------------------')
    print(printdata.description.item())
    print('-------------------- Requirements --------------------')
    print(printdata.requirements.item())
    print('-------------------- Benefits --------------------')
    print(printdata.benefits.item())


print_job(raw_data, 50)

# In[5]:

profile_tokens = list(raw_data['profile_tokens'])
print(tfidf.get_top_n_tfidf_bow(profile_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(profile_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(profile_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[6]:

profile_tokens = list(raw_data['description_tokens'])
print(tfidf.get_top_n_tfidf_bow(profile_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(profile_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(profile_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[7]:

profile_tokens = list(raw_data['requirements_tokens'])
print(tfidf.get_top_n_tfidf_bow(profile_tokens, top_n_tokens=30))
# Example 9 (separator assumed; the original was lost in extraction)

# The head of this call was truncated in the source; reconstructed (assumed) from
# the djia_tokens variable used below (djia_news is a hypothetical list of texts).
djia_tokens = textClean.pipeline(djia_news,
                                 multi_gram=[1], lower_case=True,
                                 stem_lemma='lemma', nltk_stop=True,
                                 stop_word_list=[],
                                 remove_pattern=['http:', '#', '@'],
                                 check_numbers=True,
                                 word_length=2,
                                 remove_consecutives=True)

# In[23]:

top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(djia_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(djia_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(djia_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[20]:

DataExploration.generate_word_cloud(djia_tokens)

# In[27]:

top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(nasdaq_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(nasdaq_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
# Reconstructed (assumed) mirror of the djia cell above; truncated in the source.
lda_allbow, bow_corpus, dictionary = lda.fit_lda(nasdaq_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# Example 10


print_job(raw_data, 50)

# In[145]:

# raw_data['jd_tokens'] = textClean.pipeline(raw_data['jd'].to_list(), multi_gram = [1], lower_case=True,
#                                            deacc=False, encoding='utf8', errors='strict', stem_lemma = 'lemma',
#                                            tag_drop = [], nltk_stop=True,
#                                            stop_word_list=[],
#                                            check_numbers=False, word_length=2, remove_consecutives=True)
fraud_tokens = list(raw_data[raw_data.fraudulent == 1]['jd_tokens'])
print(tfidf.get_top_n_tfidf_bow(fraud_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(fraud_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(fraud_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[146]:

raw_data['jd_tokens2'] = textClean.pipeline(raw_data['jd'].to_list(),
                                            multi_gram=[2],
                                            lower_case=True,
                                            deacc=False,
                                            encoding='utf8',
                                            errors='strict',
                                            stem_lemma='lemma',
                                            tag_drop=[],
                                            nltk_stop=True,
                                            stop_word_list=[],
                                            check_numbers=False,
                                            word_length=2,
                                            remove_consecutives=True)
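
# A sketch of the (assumed) continuation, mirroring the unigram fraud
# exploration above (fraud_bigrams is an illustrative name):
fraud_bigrams = list(raw_data[raw_data.fraudulent == 1]['jd_tokens2'])
print(tfidf.get_top_n_tfidf_bow(fraud_bigrams, top_n_tokens=30))
DataExploration.generate_word_cloud(fraud_bigrams)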