Example #1
def fit_lda(corpus,
            no_below=5,
            no_above=0.5,
            keep_n=100000,
            top_n_tokens='',
            num_topics=5):
    """
    :param corpus: corpus: a list of list of tokens (generated from textClean.pipeline)
    :param no_below: filter out tokens that less than no_below documents (absolute number)
    :param no_above: filter out tokens that more than no_above documents (fraction of total corpus size, not absolute number).
    :param keep_n: filter out tokens that after (1) and (2), keep only the first keep_n most frequent tokens (or keep all if None).
    :param top_n_tokens: top n bag of words in tfidf global list
    :param num_topics: number of topics in lda model
    :return: lda model file, bow_corpus, dictionary
    """
    if top_n_tokens == '':
        processed_docs = corpus
    else:
        selected_tokens = tfidf.get_top_n_tfidf_bow(corpus, no_below, no_above,
                                                    keep_n, top_n_tokens)
        processed_docs = [
            list(i for i in token if i in selected_tokens) for token in corpus
        ]
    dictionary = gensim.corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=no_below,
                               no_above=no_above,
                               keep_n=keep_n)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    lda_model = gensim.models.LdaMulticore(corpus=bow_corpus,
                                           num_topics=num_topics,
                                           id2word=dictionary,
                                           passes=2,
                                           workers=2,
                                           random_state=100)
    return lda_model, bow_corpus, dictionary
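
# A minimal, self-contained usage sketch for fit_lda (assumptions: gensim is
# installed, and the toy corpus and variable names below are illustrative only,
# not taken from the original notebook).
import gensim

toy_corpus = [
    ['vaccine', 'dose', 'india', 'vaccine'],
    ['hospital', 'oxygen', 'india', 'bed'],
    ['vaccine', 'sputnik', 'approval', 'india'],
]
toy_lda, toy_bow, toy_dictionary = fit_lda(toy_corpus,
                                           no_below=1,    # keep rare tokens in this tiny corpus
                                           no_above=1.0,  # do not drop frequent tokens here
                                           num_topics=2)
print(toy_lda.print_topics(num_words=3))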
# In[48]:


test = data[(data.date.astype(str) == '2021-06-01') & (data.country == 'India')].reset_index(drop=True)
test.orig_text[5]


# In[49]:


test['explore_text'] = textClean.pipeline(test['text'].to_list(),
                                          multi_gram=[1], lower_case=True,
                                          deacc=False, encoding='utf8', errors='strict',
                                          stem_lemma='lemma', tag_drop=[], nltk_stop=True,
                                          stop_word_list=['effect', 'vaccine', 'side', 'covid'],
                                          check_numbers=False, word_length=2,
                                          remove_consecutives=True)
print(tfidf.get_top_n_tfidf_bow(list(test['explore_text']), top_n_tokens=30))
DataExploration.generate_word_cloud(list(test['explore_text']))
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(list(test['explore_text']), num_topics=no_topics)
lda.lda_topics(lda_allbow)


# From the above plots we can see that people in India and Canada tweeted a lot in March and April, and the spikes in the overall volume trend were driven by India, so we will do more exploration for Canada and India. For India, it seems most tweets are neutral.
#
# In India, on 2021-Mar-01 most tweets were about vaccination; on 2021-Apr-21 most tweets were about vaccination, infection, and medical services; and on 2021-Jun-01 India started using Sputnik, etc.

# In[76]:


canada_overtime = data[data.country.isin(['Canada'])].groupby(['date', 'sentiment']).agg(**{'tweets': ('id', 'count')}).reset_index().dropna()
fig = px.line(canada_overtime, x='date', y='tweets', color='sentiment')
fig.show()
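
# A hedged sketch of the matching India view (assumptions: `data` has the same
# date/sentiment/id columns used for the Canada plot above, and plotly.express
# has been imported as px elsewhere in the notebook).
india_overtime = (data[data.country == 'India']
                  .groupby(['date', 'sentiment'])
                  .agg(**{'tweets': ('id', 'count')})
                  .reset_index()
                  .dropna())
px.line(india_overtime, x='date', y='tweets', color='sentiment').show()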
# print(data.clean_tweet[i])
print("-----------")
print(data.old_text[i])

# ## Explore Data
# We run data exploration on a sample dataset.

# In[15]:

pos_tweet = list(x.split()
                 for x in data[data['new_sentiment'] == 'positive']['text'])
neu_tweet = list(x.split()
                 for x in data[data['new_sentiment'] == 'neutral']['text'])
neg_tweet = list(x.split()
                 for x in data[data['new_sentiment'] == 'negative']['text'])
postop30tfidf = tfidf.get_top_n_tfidf_bow(pos_tweet, top_n_tokens=30)
negtop30tfidf = tfidf.get_top_n_tfidf_bow(neg_tweet, top_n_tokens=30)
print('top 30 negative tweet tfidf', negtop30tfidf)
print('top 30 positive tweet tfidf', postop30tfidf)

# In[16]:

top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tweet, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tweet, topn=10)
print(top10_posfreq_list)
print(top10_negfreq_list)

# In[17]:

DataExploration.generate_word_cloud(pos_tweet)
Example #4
                                           tag_drop=[],
                                           nltk_stop=True,
                                           stop_word_list=[],
                                           remove_pattern=[],
                                           check_numbers=True,
                                           word_length=2,
                                           remove_consecutives=True)

# In[43]:

selected_tokens = train[train.LABEL == 'Mathematics']['title_tokens'].to_list()
top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(selected_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(selected_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(selected_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[44]:

DataExploration.generate_word_cloud(selected_tokens)

# ## Fit model based on title

# In[58]:

train_index, test_index = train_test_split(train.index,
Example #5
                                   errors='strict', stem_lemma = 'lemma', tag_drop = [], nltk_stop=True, 
                                   stop_word_list=[], remove_pattern = [],
                                   check_numbers=True, word_length=2, remove_consecutives=True)


# In[16]:


top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(data['token'].to_list(), topn = 10)]
print(top_10_freq_words)


# In[17]:


top30tfidf = tfidf.get_top_n_tfidf_bow(data['token'].to_list(), top_n_tokens = 30)
print('top 30 tfidf', top30tfidf)


# In[18]:


DataExploration.generate_word_cloud(data['token'].to_list())


# In[19]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(data['token'].to_list(), num_topics = no_topics)
lda.lda_topics(lda_allbow)
Example #6
i = 1
print(data.orig_text[i])
print("-----------")
print(data.text[i])
print("-----------")
print(data.explore_text[i])

# In[58]:

pos_tweet = list(data[data['compound'] > 0]['explore_text'])
neg_tweet = list(data[data['compound'] < 0]['explore_text'])
neu_tweet = list(data[data['compound'] == 0]['explore_text'])

# In[59]:

postop30tfidf = tfidf.get_top_n_tfidf_bow(pos_tweet, top_n_tokens=30)
negtop30tfidf = tfidf.get_top_n_tfidf_bow(neg_tweet, top_n_tokens=30)
neutop30tfidf = tfidf.get_top_n_tfidf_bow(neu_tweet, top_n_tokens=30)
print('top 30 negative tweet tfidf', negtop30tfidf)
print('top 30 positive tweet tfidf', postop30tfidf)
print('top 30 neutral tweet tfidf', neutop30tfidf)

# In[40]:

top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tweet, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tweet, topn=10)
print(top10_posfreq_list)
print(top10_negfreq_list)

# In[38]:
Example #7
processed_letter_df = pd.read_csv(os.path.join(data_path, 'processed_letter.csv'))
processed_letter_df['tokens'] = processed_letter_df.clean_letter.apply(lambda x: x.split(' '))


# In[26]:


top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(processed_letter_df['tokens'].to_list(), topn = 10)]
print(top_10_freq_words)


# In[27]:


top30tfidf = tfidf.get_top_n_tfidf_bow(processed_letter_df['tokens'].to_list(), top_n_tokens = 30)
print('top 30 tfidf', top30tfidf)


# In[28]:


DataExploration.generate_word_cloud(processed_letter_df['tokens'].to_list())


# In[29]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(processed_letter_df['tokens'].to_list(), num_topics = no_topics)
lda.lda_topics(lda_allbow)
Example #8
    print('-------------------- Company --------------------')
    print(printdata.company_profile.item())
    print('-------------------- Job Description --------------------')
    print(printdata.description.item())
    print('-------------------- Requirements --------------------')
    print(printdata.requirements.item())
    print('-------------------- Benefits --------------------')
    print(printdata.benefits.item())


print_job(raw_data, 50)

# In[5]:

profile_tokens = list(raw_data['profile_tokens'])
print(tfidf.get_top_n_tfidf_bow(profile_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(profile_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(profile_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[6]:

description_tokens = list(raw_data['description_tokens'])
print(tfidf.get_top_n_tfidf_bow(description_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(description_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(description_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[7]:
Example #9
    stem_lemma='lemma',
    tag_drop=['V'],
    nltk_stop=True,
    stop_word_list=['movie', 'film', 'movies', 'films'],
    check_numbers=True,
    word_length=3,
    remove_consecutives=True)

# ## Task 2: Create a Term Document Matrix using TF-IDF

# During the Day 2 lab, we created a term-document matrix by simply counting the occurrence of words in each document. Let's try using TF-IDF to turn our documents into vectors here.

# In[5]:

tfidf_value_data = tfidf.get_tfidf_dataframe(preprocessed_tokens)
top10_tfidf_bow = tfidf.get_top_n_tfidf_bow(preprocessed_tokens,
                                            top_n_tokens=10)
top10_tfidf_bow

# In[6]:

dictionary = DocVector.generate_corpus_dict(preprocessed_tokens,
                                            no_below=1,
                                            no_above=0.5,
                                            keep_n=100000)
bow_corpus = DocVector.create_document_vector(preprocessed_tokens, dictionary)
tfidf_trans = models.TfidfModel(bow_corpus)
my_df = DocVector.get_vocab_matrix(tfidf_trans[bow_corpus], dictionary)

# In[7]:

my_df.head(3)
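
# For contrast with the count-based term-document matrix mentioned in Task 2, a
# hedged sketch that expands bow_corpus into a dense count matrix using pandas
# only (assumptions: bow_corpus is a standard gensim bag-of-words corpus, i.e. a
# list of (token_id, count) pairs per document, and dictionary maps ids to tokens).
import pandas as pd

count_df = pd.DataFrame(
    [{dictionary[token_id]: count for token_id, count in doc} for doc in bow_corpus]
).fillna(0)
count_df.head(3)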
                                   stem_lemma='lemma',
                                   tag_drop=['V'],
                                   nltk_stop=True,
                                   stop_word_list=[],
                                   remove_pattern=['http:', '#', '@'],
                                   check_numbers=True,
                                   word_length=2,
                                   remove_consecutives=True)

# In[23]:

top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(djia_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(djia_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(djia_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[20]:

DataExploration.generate_word_cloud(djia_tokens)

# In[27]:

top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(nasdaq_tokens, topn=10)
]
Example #11
    print(printdata.requirements.item())
    print('-------------------- Benefits --------------------')
    print(printdata.benefits.item())


print_job(raw_data, 50)

# In[145]:

# raw_data['jd_tokens'] = textClean.pipeline(raw_data['jd'].to_list(), multi_gram = [1], lower_case=True,
#                                            deacc=False, encoding='utf8', errors='strict', stem_lemma = 'lemma',
#                                            tag_drop = [], nltk_stop=True,
#                                            stop_word_list=[],
#                                            check_numbers=False, word_length=2, remove_consecutives=True)
fraud_tokens = list(raw_data[raw_data.fraudulent == 1]['jd_tokens'])
print(tfidf.get_top_n_tfidf_bow(fraud_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(fraud_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(fraud_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[146]:

raw_data['jd_tokens2'] = textClean.pipeline(raw_data['jd'].to_list(),
                                            multi_gram=[2],
                                            lower_case=True,
                                            deacc=False,
                                            encoding='utf8',
                                            errors='strict',
                                            stem_lemma='lemma',
                                            tag_drop=[],