import csv
import json
import time

from tweepy import API, Cursor, OAuthHandler
from tweepy.streaming import Stream, StreamListener
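# NOTE: these imports assume the tweepy 3.x API; StreamListener was removed in
# tweepy 4.0, so pinning an older release (e.g. `pip install "tweepy<4"`) may
# be required. The pin is a suggestion, not part of the original script.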
# User credentials used to access the Twitter API (fill these in)
access_token = ""
access_token_secret = ""
consumer_key = ""
consumer_secret = ""
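# A minimal sketch of loading the credentials from the environment instead of
# hard-coding them (the variable names below are assumptions, not part of the
# original script):
#
#   import os
#   access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "")
#   access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "")
#   consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "")
#   consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "")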
# File paths used by the script
tweets_data_path = 'abcd.json'
users_tweets_path = 'users_tweets.csv'
users_friends_path = 'users_friends.csv'
class GetTwitterData(object):

    def __init__(self, auth):
        self.auth = auth
        self.api = API(self.auth)

    def get_all_tweets(self, screen_name, tweet_count):
        # initialize a list to hold all the tweepy Tweets
        alltweets = []
        # make initial request for most recent tweets (200 is the maximum allowed count)
        new_tweets = self.api.user_timeline(screen_name=screen_name, count=tweet_count)
        # save most recent tweets
        alltweets.extend(new_tweets)
        # keep grabbing tweets until there are none left to grab
        while len(new_tweets) > 0:
            # save the id of the oldest tweet, less one, so the next request
            # starts just before it; computing it inside the loop also avoids
            # an IndexError when the account has no tweets at all
            oldest = alltweets[-1].id - 1
            print("getting tweets before %s" % oldest)
            # all subsequent requests use the max_id param to prevent duplicates
            new_tweets = self.api.user_timeline(screen_name=screen_name, count=tweet_count, max_id=oldest)
            # save the newly fetched tweets
            alltweets.extend(new_tweets)
            print("...%s tweets downloaded so far" % len(alltweets))
        # transform the tweepy tweets into a 2D array that will populate the csv
        out_tweets = [[screen_name, tweet.id_str, tweet.created_at, tweet.text]
                      for tweet in alltweets]
        # write the csv
        with open(users_tweets_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(["screen_name", "id", "created_at", "text"])
            writer.writerows(out_tweets)
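    # Usage sketch for the method above ("some_user" is a hypothetical handle;
    # assumes the credentials at the top of the file are filled in):
    #
    #   tw_getter = GetTwitterData(auth)
    #   tw_getter.get_all_tweets("some_user", 200)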
    def new_get_all_tweets(self, screen_name, tweet_count):
        # make a single request for the most recent tweets (200 is the maximum allowed count)
        new_tweets = self.api.user_timeline(screen_name=screen_name, count=tweet_count)
        # transform the tweepy tweets into a 2D array that will populate the csv
        out_tweets = [[screen_name, tweet.id_str, tweet.created_at, tweet.text]
                      for tweet in new_tweets]
        # write the csv
        with open(users_tweets_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(["screen_name", "id", "created_at", "text"])
            writer.writerows(out_tweets)
    def find_friends(self, screen_name):
        # note: despite the name, this walks the account's *followers*
        # (via followers_ids), not its friends
        print("screen_name: " + screen_name)
        # page through the user's follower ids, 50 at a time
        for ids in Cursor(self.api.followers_ids, screen_name=screen_name, count=50).pages():
            print(ids)
            print("ids are: " + str(len(ids)))
            # resolve ids to screen names; users/lookup accepts up to 100 ids
            # per call, so pages of 50 fit in a single request (an earlier,
            # commented-out version chunked the ids into groups of 90 instead)
            friends_list = [user.screen_name for user in self.api.lookup_users(user_ids=ids)]
            print("list of users\n")
            print(friends_list)
            friends_list_output = [[screen_name, ids[indx], friend]
                                   for indx, friend in enumerate(friends_list)]
            print(friends_list_output)
            with open(users_friends_path, 'a', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                # write the header row only once, when the file is still empty
                if f.tell() == 0:
                    writer.writerow(["screen_name", "id", "friends"])
                writer.writerows(friends_list_output)
            time.sleep(1)
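    # Note: followers/ids and users/lookup are rate-limited endpoints. One way
    # to harden this class (an assumption, not in the original script) is to
    # let tweepy sleep through rate-limit windows when building the client:
    #
    #   self.api = API(self.auth, wait_on_rate_limit=True)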
    def readfile(self):
        tweets_data = []
        with open(tweets_data_path, "r") as tweets_file:
            for line in tweets_file:
                try:
                    tweets_data.append(json.loads(line))
                except ValueError:
                    # skip blank or malformed lines
                    continue
        print(len(tweets_data))
        counter = 1
        screen_names = []
        for tweet in tweets_data:
            # append (not extend) so we collect whole screen names, not characters
            screen_names.append(tweet['user']['screen_name'])
            try:
                self.new_get_all_tweets(tweet['user']['screen_name'], 5)
            except Exception as e:
                print("error:\n")
                print(str(e))
            try:
                print(tweet['user']['screen_name'])
                self.find_friends(tweet['user']['screen_name'])
            except Exception as e:
                print("fail:\n")
                print(str(e))
            print(counter)
            counter += 1
# A basic listener that writes received tweets to a file for a fixed time window.
class StdOutListener(StreamListener):

    def __init__(self, time_limit=60):
        self.start_time = time.time()
        self.limit = time_limit
        self.save_file = open(tweets_data_path, 'a')
        super(StdOutListener, self).__init__()

    def on_data(self, data):
        # keep appending raw tweet JSON until the time limit expires
        if (time.time() - self.start_time) < self.limit:
            self.save_file.write(data)
            self.save_file.write('\n')
            return True
        else:
            # returning False tells tweepy to disconnect the stream
            self.save_file.close()
            return False

    def on_error(self, status):
        print(status)
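# A common extension to the listener above (an assumption, not in the original
# script): return False from on_error when Twitter sends HTTP 420, so the
# stream disconnects instead of reconnecting and escalating the rate-limit
# penalty:
#
#   def on_error(self, status):
#       print(status)
#       if status == 420:
#           return False
#       return True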
if __name__ == '__main__':
    # This handles Twitter authentication and the connection to the Twitter Streaming API
    listener = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, listener)
    tw_getter = GetTwitterData(auth)
    stream.filter(track=['#'])
    tw_getter.readfile()
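    # Note: stream.filter(track=['#']) blocks until on_data returns False
    # (once the listener's time limit expires); only then does readfile() run
    # on the captured file.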