test = data[(data.date.astype(str) == '2021-06-01') & (data.country == 'India')].reset_index(drop=True)
test.orig_text[5]


# In[49]:


test['explore_text'] = textClean.pipeline(test['text'].to_list(), multi_gram = [1], lower_case=True, 
                                                 deacc=False, encoding='utf8', errors='strict', stem_lemma = 'lemma', 
                                                 tag_drop = [], nltk_stop=True, 
                                                 stop_word_list=['effect','vaccine','side','covid'], 
                                                 check_numbers=False, word_length=2, remove_consecutives=True)
print(tfidf.get_top_n_tfidf_bow(list(test['explore_text']), top_n_tokens = 30))
DataExploration.generate_word_cloud(list(test['explore_text']))
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(list(test['explore_text']), num_topics=no_topics)
lda.lda_topics(lda_allbow)


# From the plots above we can see that people in India and Canada tweeted a lot in March and April, and that the spikes in the overall volume trend were driven by India, so we will explore Canada and India further. For India, it seems most tweets are neutral.
# 
# In India, most tweets on 2021-03-01 were about vaccines; on 2021-04-21 they were about vaccines, infections, and medical services; and around 2021-06-01 India started using Sputnik V.

# In[76]:


canada_overtime = data[data.country.isin(['Canada'])].groupby(['date', 'sentiment']).agg(**{'tweets': ('id', 'count')}).reset_index().dropna()
fig = px.line(canada_overtime, x='date', y='tweets', color='sentiment',
             title='Timeline showing emotion of tweets in Canada about COVID-19 vaccines')
fig.show()

# ## Example 2

# Head of this call reconstructed from the pipeline usage above; the original
# cell was truncated in this export, and unstated arguments assume library defaults.
train['title_tokens'] = textClean.pipeline(train['TITLE'].to_list(), multi_gram=[1],
                                           lower_case=True, stem_lemma='lemma',
                                           nltk_stop=True, stop_word_list=[],
                                           remove_pattern=[],
                                           check_numbers=True,
                                           word_length=2,
                                           remove_consecutives=True)

# In[43]:

selected_tokens = train[train.LABEL == 'Mathematics']['title_tokens'].to_list()
top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(selected_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(selected_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(selected_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[44]:

DataExploration.generate_word_cloud(selected_tokens)

# ## Fit model based on title

# In[58]:

train_index, test_index = train_test_split(train.index,
                                           test_size=0.33,
                                           random_state=42)
X_train = train[train.index.isin(train_index)][['TITLE']]
X_test = train[train.index.isin(test_index)][['TITLE']]
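
# The fitting cells themselves are missing from this export. A minimal sketch of
# one way to continue, assuming a scikit-learn TF-IDF + logistic-regression
# pipeline (the model choice is an assumption, not the original notebook's code);
# train.LABEL, used earlier, serves as the target.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

y_train = train[train.index.isin(train_index)]['LABEL']  # targets for the same index split
y_test = train[train.index.isin(test_index)]['LABEL']

title_clf = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=2, ngram_range=(1, 2))),
    ('clf', LogisticRegression(max_iter=1000)),
])
title_clf.fit(X_train['TITLE'], y_train)
print('test accuracy:', title_clf.score(X_test['TITLE'], y_test))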
# In[17]:

DataExploration.generate_word_cloud(pos_tweet)

# In[18]:

DataExploration.generate_word_cloud(neg_tweet)

# We didn't remove stop words, so LDA does not work well here; for topic modelling, stop words need to be removed. For sentiment analysis, however, it is better to keep all words.
# Even for sentiment analysis, though, we set a minimum word length, which here is 2. Both settings are sketched below.
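
# To illustrate, the same textClean.pipeline call can serve both purposes by
# toggling the stop-word flag. This is a sketch reusing the signature shown
# earlier; `tweets` stands in for a list of raw tweet strings, and nltk_stop=False
# is assumed to keep stop words (both are assumptions, not original code).
topic_tokens = textClean.pipeline(tweets, multi_gram=[1], lower_case=True,
                                  stem_lemma='lemma', nltk_stop=True,  # drop stop words for LDA
                                  stop_word_list=[], check_numbers=False,
                                  word_length=2, remove_consecutives=True)
sentiment_tokens = textClean.pipeline(tweets, multi_gram=[1], lower_case=True,
                                      stem_lemma='lemma', nltk_stop=False,  # keep stop words
                                      stop_word_list=[], check_numbers=False,
                                      word_length=2, remove_consecutives=True)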

# In[20]:

no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(pos_tweet,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# ## Prepare training/testing/validation dataset

# In[3]:

X = data.text.tolist()
y = pd.get_dummies(data.label).values
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=11)
# 0.125 of the remaining 80% yields a 70/10/20 train/val/test split.
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  test_size=0.125,
                                                  random_state=11)  # closing arguments assumed; cell truncated here

test.head(3)


# In[107]:


display(tfidf_data.head(3))
list(test.title_x)


# ## LDA

# In[133]:


lda_allbow, bow_corpus, dictionary = lda.fit_lda(sentences, num_topics = 5)


# In[134]:


lda.lda_topics(lda_allbow)


# In[135]:


import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
# Render the interactive topic view (assumed next step; the cell was truncated here).
vis = gensimvis.prepare(lda_allbow, bow_corpus, dictionary)
vis

# ## Example 5

top30tfidf = tfidf.get_top_n_tfidf_bow(data['token'].to_list(), top_n_tokens = 30)
print('top 30 tfidf', top30tfidf)


# In[18]:


DataExploration.generate_word_cloud(data['token'].to_list())


# In[19]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(data['token'].to_list(), num_topics = no_topics)
lda.lda_topics(lda_allbow)


# In[33]:


tfidf_data = tfidf.get_tfidf_dataframe(data['token'].to_list(), 
                                       doc_index = list(data.doc_id),
                                       no_below =5, 
                                       no_above = 0.5, 
                                       keep_n = 100000)


# In[28]:

# ## Example 6

DataExploration.generate_word_cloud(pos_tweet)

# In[41]:

DataExploration.generate_word_cloud(neg_tweet)

# In[42]:

DataExploration.generate_word_cloud(neu_tweet)

# ## LDA

# In[43]:

no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(pos_tweet,
                                                 num_topics=no_topics)
# lda_top30bow, bow_corpus, dictionary  = lda.fit_lda(pos_tweet, top_n_tokens = 30, num_topics = no_topics)
lda.lda_topics(lda_allbow)

# In[44]:

no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(neg_tweet,
                                                 num_topics=no_topics)
# lda_top30bow, bow_corpus, dictionary  = lda.fit_lda(neg_tweet, top_n_tokens = 30, num_topics = no_topics)
lda.lda_topics(lda_allbow)

# ## Explore text for each vaccine

# In[51]:

# ## Example 7

top30tfidf = tfidf.get_top_n_tfidf_bow(processed_letter_df['tokens'].to_list(), top_n_tokens = 30)
print('top 30 tfidf', top30tfidf)


# In[28]:


DataExploration.generate_word_cloud(processed_letter_df['tokens'].to_list())


# In[29]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(processed_letter_df['tokens'].to_list(), num_topics = no_topics)
lda.lda_topics(lda_allbow)


# ### Words Frequency

# In[8]:


dictionary = DocVector.generate_corpus_dict(processed_letter_df['tokens'].to_list(), no_below =1,
                                            no_above = 1, keep_n = 100000)
bow_corpus = DocVector.create_document_vector(processed_letter_df['tokens'].to_list(), dictionary)
my_df = DocVector.get_vocab_matrix(bow_corpus, dictionary)
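
# To surface the word frequencies this section is about: assuming my_df is a
# documents-by-vocabulary count matrix (which the DocVector names above suggest),
# column sums give corpus-wide token counts.
word_freq = my_df.sum(axis=0).sort_values(ascending=False)
print(word_freq.head(20))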


# In[31]:

# ## Example 8

# Function head reconstructed; the original lines were truncated in this export.
def print_job(df, index):
    printdata = df[df.index == index]
    print('-------------------- Description --------------------')
    print(printdata.description.item())
    print('-------------------- Requirements --------------------')
    print(printdata.requirements.item())
    print('-------------------- Benefits --------------------')
    print(printdata.benefits.item())


print_job(raw_data, 50)

# In[5]:

profile_tokens = list(raw_data['profile_tokens'])
print(tfidf.get_top_n_tfidf_bow(profile_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(profile_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(profile_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[6]:

profile_tokens = list(raw_data['description_tokens'])
print(tfidf.get_top_n_tfidf_bow(profile_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(profile_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(profile_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[7]:

profile_tokens = list(raw_data['requirements_tokens'])
print(tfidf.get_top_n_tfidf_bow(profile_tokens, top_n_tokens=30))
# Head of this call reconstructed; the original cell was truncated in this export
# (djia_news is a stand-in for the raw headline strings used upstream, and
# unstated arguments assume library defaults).
djia_tokens = textClean.pipeline(djia_news, multi_gram=[1], lower_case=True,
                                 stem_lemma='lemma', nltk_stop=True,
                                 stop_word_list=[],
                                 remove_pattern=['http:', '#', '@'],
                                 check_numbers=True,
                                 word_length=2,
                                 remove_consecutives=True)

# In[23]:

top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(djia_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(djia_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(djia_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[20]:

DataExploration.generate_word_cloud(djia_tokens)

# In[27]:

top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(nasdaq_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(nasdaq_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
# Completed to mirror the DJIA cell above; the original was truncated here.
lda_allbow, bow_corpus, dictionary = lda.fit_lda(nasdaq_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# ## Example 10

print_job(raw_data, 50)

# In[145]:

# raw_data['jd_tokens'] = textClean.pipeline(raw_data['jd'].to_list(), multi_gram = [1], lower_case=True,
#                                            deacc=False, encoding='utf8', errors='strict', stem_lemma = 'lemma',
#                                            tag_drop = [], nltk_stop=True,
#                                            stop_word_list=[],
#                                            check_numbers=False, word_length=2, remove_consecutives=True)
fraud_tokens = list(raw_data[raw_data.fraudulent == 1]['jd_tokens'])
print(tfidf.get_top_n_tfidf_bow(fraud_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(fraud_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(fraud_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[146]:

raw_data['jd_tokens2'] = textClean.pipeline(raw_data['jd'].to_list(),
                                            multi_gram=[2],
                                            lower_case=True,
                                            deacc=False,
                                            encoding='utf8',
                                            errors='strict',
                                            stem_lemma='lemma',
                                            tag_drop=[],
                                            nltk_stop=True,
                                            stop_word_list=[],
                                            check_numbers=False,
                                            word_length=2,
                                            remove_consecutives=True)