# twitter_srt.py - stripped GitHub page-scrape residue (UI text and line-number gutter)
# -*- coding: utf-8 -*-
import tweepy #https://github.com/tweepy/tweepy
#Twitter API credentials
# SECURITY NOTE(review): secrets are hard-coded in source.  Any key committed
# like this must be treated as compromised -- revoke these and load them from
# environment variables or a config file kept out of version control.
consumer_key = "swFC3PYt7hh5F6EGtPg7ATnQx"
consumer_secret = "682qgTTOArAShjpTqfkjFvH27Zj4NEvhz1W0pButZmX1YdHT5g"
access_key = "570989918-VfjD9Tr4BDOLIMBlZZUQ9bzrQiE75JQbCrSPu5LK"
access_secret = "JeiuyQFkhltsA3npJYVMiMaevjY49wQtMxJprtf70E9HK"
def get_all_tweets(screen_name):
    """Download a user's recent timeline and save it as "<screen_name>_tweets.csv".

    Pages backwards through the timeline 200 tweets at a time (the per-request
    maximum of this endpoint) until the API returns an empty batch.

    Parameters
    ----------
    screen_name : str
        Twitter handle to fetch (without the leading '@').

    Returns
    -------
    pandas.DataFrame
        One row per tweet; every value is stringified, matching the CSV output.
    """
    import pandas as pd

    # Authenticate with the module-level credentials (see the security note
    # where they are defined).
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    alltweets = []
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)
    alltweets.extend(new_tweets)

    # BUG FIX: the original computed alltweets[-1].id before checking whether
    # any tweets came back, crashing with IndexError for an empty timeline.
    # Here `oldest` is only computed while the last batch was non-empty.
    while new_tweets:
        oldest = alltweets[-1].id - 1  # strictly older than everything we hold
        new_tweets = api.user_timeline(screen_name=screen_name, count=200,
                                       max_id=oldest)
        alltweets.extend(new_tweets)
        print("...%s tweets downloaded so far" % len(alltweets))

    columns = ["time", "hashtags", "user_mentions", "favorite_count",
               "geo", "id_str", "lang", "place", "retweet_count",
               "retweeted", "source", "text", "location", "name",
               "time_zone", "utc_offset"]
    # Build the rows once instead of the original's 16 separate per-column
    # list comprehensions; every field is stringified exactly as before.
    rows = [[str(v) for v in (
        tweet.created_at, tweet.entities["hashtags"],
        tweet.entities["user_mentions"], tweet.favorite_count,
        tweet.geo, tweet.id_str, tweet.lang, tweet.place,
        tweet.retweet_count, tweet.retweeted, tweet.source, tweet.text,
        tweet._json["user"]["location"], tweet._json["user"]["name"],
        tweet._json["user"]["time_zone"], tweet._json["user"]["utc_offset"])]
        for tweet in alltweets]
    tweets_df = pd.DataFrame(rows, columns=columns)

    tweets_df.to_csv(screen_name + "_tweets.csv")
    return tweets_df
# BUG FIX: "Elon Musk" is not a valid Twitter handle -- screen names cannot
# contain spaces.  The account's actual screen name is "elonmusk".
srt = get_all_tweets("elonmusk")
import pandas as pd
# Column 11 is the tweet text; join every tweet into one corpus string for
# sentence tokenization below.
srt1 = list(srt.iloc[:, 11])
srt_rev = ' '.join(srt1)
##Performing tokenization
from nltk.tokenize import sent_tokenize
token_sent = sent_tokenize(srt_rev)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
wordnet = WordNetLemmatizer()
import re
# Build the stopword set once instead of on every loop iteration.
english_stopwords = set(stopwords.words('english'))
filtered_sent = []
# Clean each sentence: keep letters only, lowercase, drop stopwords, lemmatize.
for sent in token_sent:
    # BUG FIX: the original ran the second re.sub on the raw sentence again,
    # silently discarding the letters-only substitution.  Both substitutions
    # now chain on the same working string.  (The digit pass is kept for
    # safety although "[^A-Za-z]+" already removes digits.)
    review = re.sub("[^A-Za-z]+", " ", sent)
    review = re.sub("[0-9]+", " ", review)
    review = review.lower()
    review = review.split()
    review = [wordnet.lemmatize(word) for word in review
              if word not in english_stopwords]
    filtered_sent.append(' '.join(review))
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
text_tf = tf.fit_transform(filtered_sent)
# BUG FIX: TfidfVectorizer.get_feature_names() was deprecated in scikit-learn
# 1.0 and removed in 1.2; get_feature_names_out() is the supported API.
feature_names = tf.get_feature_names_out()
dense = text_tf.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
#plotting wordcloud on TFIDF
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# NOTE(review): iterating a DataFrame yields its column names, so this cloud
# is drawn from the vocabulary only -- the TF-IDF weights are never used.
cloud = ' '.join(df)
wordcloud = WordCloud(
    background_color='black',
    width=1800,
    height=1400).generate(cloud)
plt.imshow(wordcloud)
##Importing positive words to plot positive word cloud
# Presumably Hu & Liu's opinion lexicon; [36:] skips what looks like the
# file's comment preamble -- TODO confirm against the actual file.
with open("E:\\Assignment\\11) Text mining\\positive-words.txt", "r") as pos:
    poswords = pos.read().split("\n")
poswords = set(poswords[36:])  # set: O(1) membership vs. O(n) list scan per word
pos_words = ' '.join([w for w in df if w in poswords])
cloud_pos = WordCloud(
    background_color='black',
    width=1800,
    height=1400).generate(pos_words)
plt.imshow(cloud_pos)
##Importing negative words to build negative word cloud
# Presumably Hu & Liu's opinion lexicon; [37:] skips what looks like the
# file's comment preamble -- TODO confirm against the actual file.
with open("E:\\Assignment\\11) Text mining\\negative-words.txt", "r") as nos:
    negwords = nos.read().split("\n")
negwords = set(negwords[37:])  # set: O(1) membership vs. O(n) list scan per word
neg_words = ' '.join([w for w in df if w in negwords])
cloud_neg = WordCloud(
    background_color='black',
    width=1800,
    height=1400).generate(neg_words)
plt.imshow(cloud_neg)