Example #1
import gensim  # the snippets below also assume project-local helpers: tfidf, textClean, DataExploration, lda, DocVector


def fit_lda(corpus,
            no_below=5,
            no_above=0.5,
            keep_n=100000,
            top_n_tokens='',
            num_topics=5):
    """
    :param corpus: corpus: a list of list of tokens (generated from textClean.pipeline)
    :param no_below: filter out tokens that less than no_below documents (absolute number)
    :param no_above: filter out tokens that more than no_above documents (fraction of total corpus size, not absolute number).
    :param keep_n: filter out tokens that after (1) and (2), keep only the first keep_n most frequent tokens (or keep all if None).
    :param top_n_tokens: top n bag of words in tfidf global list
    :param num_topics: number of topics in lda model
    :return: lda model file, bow_corpus, dictionary
    """
    if top_n_tokens == '':
        processed_docs = corpus
    else:
        selected_tokens = tfidf.get_top_n_tfidf_bow(corpus, no_below, no_above,
                                                    keep_n, top_n_tokens)
        processed_docs = [
            [tok for tok in doc if tok in selected_tokens] for doc in corpus
        ]
    dictionary = gensim.corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=no_below,
                               no_above=no_above,
                               keep_n=keep_n)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    lda_model = gensim.models.LdaMulticore(corpus=bow_corpus,
                                           num_topics=num_topics,
                                           id2word=dictionary,
                                           passes=2,
                                           workers=2,
                                           random_state=100)
    return lda_model, bow_corpus, dictionary
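
# A minimal, self-contained usage sketch for fit_lda. The toy `docs` corpus below is
# illustrative only; in the cells that follow, real corpora come from textClean.pipeline.
# no_below is lowered to 1 so the tiny example corpus is not filtered away entirely.
docs = [['vaccine', 'dose', 'today'],
        ['vaccine', 'side', 'effect'],
        ['hospital', 'bed', 'shortage']]
example_lda, example_bow, example_dict = fit_lda(docs, no_below=1, num_topics=2)
print(example_lda.print_topics())
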
# In[48]:


test = data[(data.date.astype(str) == '2021-06-01') & (data.country == 'India')].reset_index(drop=True)
test.orig_text[5]


# In[49]:


test['explore_text'] = textClean.pipeline(test['text'].to_list(), multi_gram=[1], lower_case=True,
                                          deacc=False, encoding='utf8', errors='strict', stem_lemma='lemma',
                                          tag_drop=[], nltk_stop=True,
                                          stop_word_list=['effect', 'vaccine', 'side', 'covid'],
                                          check_numbers=False, word_length=2, remove_consecutives=True)
print(tfidf.get_top_n_tfidf_bow(list(test['explore_text']), top_n_tokens = 30))
DataExploration.generate_word_cloud(list(test['explore_text']))
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(list(test['explore_text']), num_topics=no_topics)
lda.lda_topics(lda_allbow)


# From the plots above we can see that people in India and Canada tweeted heavily in March and April, and the spikes in the overall volume trend were driven by India, so we will explore Canada and India further. For India, most tweets appear to be neutral.
# 
# In India, on 2021-03-01 most tweets were about vaccines; on 2021-04-21 most tweets were about vaccines, infection, and medical services; around 2021-06-01 India started using Sputnik, and so on.

# In[76]:


canada_overtime = data[data.country == 'Canada'].groupby(['date', 'sentiment']).agg(tweets=('id', 'count')).reset_index().dropna()
fig = px.line(canada_overtime, x='date', y='tweets', color='sentiment')
fig.show()
# print(data.clean_tweet[i])
print("-----------")
print(data.old_text[i])

# ## Explore Data
# We run data exploration on the sample dataset.

# In[15]:

pos_tweet = list(x.split()
                 for x in data[data['new_sentiment'] == 'positive']['text'])
neu_tweet = list(x.split()
                 for x in data[data['new_sentiment'] == 'neutral']['text'])
neg_tweet = list(x.split()
                 for x in data[data['new_sentiment'] == 'negative']['text'])
postop30tfidf = tfidf.get_top_n_tfidf_bow(pos_tweet, top_n_tokens=30)
negtop30tfidf = tfidf.get_top_n_tfidf_bow(neg_tweet, top_n_tokens=30)
print('top 30 negative tweet tfidf', negtop30tfidf)
print('top 30 positive tweet tfidf', postop30tfidf)

# In[16]:

top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tweet, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tweet, topn=10)
print(top10_posfreq_list)
print(top10_negfreq_list)

# In[17]:

DataExploration.generate_word_cloud(pos_tweet)
Example #4
                                           tag_drop=[],
                                           nltk_stop=True,
                                           stop_word_list=[],
                                           remove_pattern=[],
                                           check_numbers=True,
                                           word_length=2,
                                           remove_consecutives=True)

# In[43]:

selected_tokens = train[train.LABEL == 'Mathematics']['title_tokens'].to_list()
top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(selected_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(selected_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(selected_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[44]:

DataExploration.generate_word_cloud(selected_tokens)

# ## Fit model based on title

# In[58]:

train_index, test_index = train_test_split(train.index,
Example #5
                                   errors='strict', stem_lemma = 'lemma', tag_drop = [], nltk_stop=True, 
                                   stop_word_list=[], remove_pattern = [],
                                   check_numbers=True, word_length=2, remove_consecutives=True)


# In[16]:


top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(data['token'].to_list(), topn = 10)]
print(top_10_freq_words)


# In[17]:


top30tfidf = tfidf.get_top_n_tfidf_bow(data['token'].to_list(), top_n_tokens = 30)
print('top 30 tfidf', top30tfidf)


# In[18]:


DataExploration.generate_word_cloud(data['token'].to_list())


# In[19]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(data['token'].to_list(), num_topics = no_topics)
lda.lda_topics(lda_allbow)
Example #6
i = 1
print(data.orig_text[i])
print("-----------")
print(data.text[i])
print("-----------")
print(data.explore_text[i])

# In[58]:

pos_tweet = list(data[data['compound'] > 0]['explore_text'])
neg_tweet = list(data[data['compound'] < 0]['explore_text'])
neu_tweet = list(data[data['compound'] == 0]['explore_text'])

# In[59]:

postop30tfidf = tfidf.get_top_n_tfidf_bow(pos_tweet, top_n_tokens=30)
negtop30tfidf = tfidf.get_top_n_tfidf_bow(neg_tweet, top_n_tokens=30)
neutop30tfidf = tfidf.get_top_n_tfidf_bow(neu_tweet, top_n_tokens=30)
print('top 30 negative tweet tfidf', negtop30tfidf)
print('top 30 positive tweet tfidf', postop30tfidf)
print('top 30 neutral tweet tfidf', neutop30tfidf)

# In[40]:

top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tweet, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tweet, topn=10)
print(top10_posfreq_list)
print(top10_negfreq_list)

# In[38]:
Example #7
processed_letter_df = pd.read_csv(os.path.join(data_path, 'processed_letter.csv'))
processed_letter_df['tokens'] = processed_letter_df.clean_letter.apply(lambda x: x.split(' '))


# In[26]:


top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(processed_letter_df['tokens'].to_list(), topn = 10)]
print(top_10_freq_words)


# In[27]:


top30tfidf = tfidf.get_top_n_tfidf_bow(processed_letter_df['tokens'].to_list(), top_n_tokens = 30)
print('top 30 tfidf', top30tfidf)


# In[28]:


DataExploration.generate_word_cloud(processed_letter_df['tokens'].to_list())


# In[29]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(processed_letter_df['tokens'].to_list(), num_topics = no_topics)
lda.lda_topics(lda_allbow)
Example #8
    print('-------------------- Company --------------------')
    print(printdata.company_profile.item())
    print('-------------------- Job Description --------------------')
    print(printdata.description.item())
    print('-------------------- Requirements --------------------')
    print(printdata.requirements.item())
    print('-------------------- Benefits --------------------')
    print(printdata.benefits.item())


print_job(raw_data, 50)

# In[5]:

profile_tokens = list(raw_data['profile_tokens'])
print(tfidf.get_top_n_tfidf_bow(profile_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(profile_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(profile_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[6]:

description_tokens = list(raw_data['description_tokens'])
print(tfidf.get_top_n_tfidf_bow(description_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(description_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(description_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[7]:
Example #9
    stem_lemma='lemma',
    tag_drop=['V'],
    nltk_stop=True,
    stop_word_list=['movie', 'film', 'movies', 'films'],
    check_numbers=True,
    word_length=3,
    remove_consecutives=True)

# ## Task 2: Create a Term Document Matrix using TF-IDF

# During the Day 2 lab, we created a term-document matrix by simply counting the occurrence of words in each document. Let's try using TF-IDF to turn our documents into vectors here.
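
# A tiny, standalone sketch of the reweighting idea (not the library call used below):
# each raw count becomes tf * idf, so terms that occur in many documents are down-weighted.
# The exact idf formula varies by library; the common log(N / df) form is used here purely
# for illustration.
import math

toy_docs = [['good', 'movie'], ['bad', 'movie'], ['good', 'plot']]
N = len(toy_docs)
vocab = {t for d in toy_docs for t in d}
df = {t: sum(t in d for d in toy_docs) for t in vocab}   # document frequency per term
idf = {t: round(math.log(N / df[t]), 3) for t in vocab}  # rarer terms get a larger idf
print(idf)  # 'bad' and 'plot' outweigh the common 'good' and 'movie'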

# In[5]:

tfidf_value_data = tfidf.get_tfidf_dataframe(preprocessed_tokens)
top10_tfidf_bow = tfidf.get_top_n_tfidf_bow(preprocessed_tokens,
                                            top_n_tokens=10)
top10_tfidf_bow

# In[6]:

dictionary = DocVector.generate_corpus_dict(preprocessed_tokens,
                                            no_below=1,
                                            no_above=0.5,
                                            keep_n=100000)
bow_corpus = DocVector.create_document_vector(preprocessed_tokens, dictionary)
tfidf_trans = models.TfidfModel(bow_corpus)
my_df = DocVector.get_vocab_matrix(tfidf_trans[bow_corpus], dictionary)

# In[7]:

my_df.head(3)
                                   stem_lemma='lemma',
                                   tag_drop=['V'],
                                   nltk_stop=True,
                                   stop_word_list=[],
                                   remove_pattern=['http:', '#', '@'],
                                   check_numbers=True,
                                   word_length=2,
                                   remove_consecutives=True)

# In[23]:

top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(djia_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(djia_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(djia_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[20]:

DataExploration.generate_word_cloud(djia_tokens)

# In[27]:

top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(nasdaq_tokens, topn=10)
]
Example #11
    print(printdata.requirements.item())
    print('-------------------- Benefits --------------------')
    print(printdata.benefits.item())


print_job(raw_data, 50)

# In[145]:

# raw_data['jd_tokens'] = textClean.pipeline(raw_data['jd'].to_list(), multi_gram = [1], lower_case=True,
#                                            deacc=False, encoding='utf8', errors='strict', stem_lemma = 'lemma',
#                                            tag_drop = [], nltk_stop=True,
#                                            stop_word_list=[],
#                                            check_numbers=False, word_length=2, remove_consecutives=True)
fraud_tokens = list(raw_data[raw_data.fraudulent == 1]['jd_tokens'])
print(tfidf.get_top_n_tfidf_bow(fraud_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(fraud_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(fraud_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[146]:

raw_data['jd_tokens2'] = textClean.pipeline(raw_data['jd'].to_list(),
                                            multi_gram=[2],
                                            lower_case=True,
                                            deacc=False,
                                            encoding='utf8',
                                            errors='strict',
                                            stem_lemma='lemma',
                                            tag_drop=[],