-
Notifications
You must be signed in to change notification settings - Fork 0
/
TwitterCrawler.py
114 lines (89 loc) · 3.58 KB
/
TwitterCrawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import time
import twitter
import Queue
import Q2logging
import threading
import os
"""
TODO:
If the try block in feedGetter.run() fails (raises an exception), delete the file from the media folder.
"""
class feedGetter(threading.Thread):
    """Worker thread that downloads one user's timeline and saves it to disk.

    Args:
        user: Twitter handle; passed to the API and used to build the
            output file name.
        api: Object exposing ``GetUserTimeline`` (this file passes a
            ``twitter.Api`` instance).
    """

    def __init__(self, user, api):
        threading.Thread.__init__(self)
        self.user = user
        self.api = api

    def run(self):
        """Fetch up to 100 tweets (replies excluded) for the user and save them.

        The API may return fewer statuses than requested. Any failure is
        logged and swallowed so that a handle that has disappeared or been
        deleted cannot take the crawler down.
        """
        try:
            # Logged before the work starts so failed fetches are traceable too.
            logger.write_line("Init thread to get tweets for %s " % self.user)
            statuses = self.api.GetUserTimeline(self.user, count=100, exclude_replies=True)
            self.twitterSave(statuses)
        except Exception as err:
            # Best-effort by design: record the cause instead of hiding it.
            logger.write_line("Twitter object failed for %s (%r)" % (self.user, err))

    def twitterSave(self, statuses):
        """Write the text of each status to the user's feed file, one per line.

        twitter.api returns the data as objects; this strips out the tweet
        text, converts it to UTF-8, removes embedded newlines and saves it.

        Args:
            statuses: Iterable of objects with a ``text`` attribute.
        """
        # Encode everything up front: if a status cannot be converted, the
        # existing feed file is left untouched instead of being truncated.
        lines = [status.text.encode("utf-8").replace("\n", "") for status in statuses]
        # The context manager closes the file even if a write fails.
        with open("twitterFeeds\\" + self.user + ".twitter", 'w') as fout:
            for line in lines:
                fout.write(line)
                fout.write("\n")
        logger.write_line("twitterFeeds file saved for %s" % self.user)
#==================================================================================================
# Module-level log sink shared by feedGetter and twitterThread via logger.write_line().
# NOTE(review): presumably out_file_instance opens/creates a log file at this
# Windows-style relative path when the module is imported -- confirm against Q2logging.
logger = Q2logging.out_file_instance("logs\\twitterCrawler\\twitterCrawler")
def main():
    """Create an empty queue of new names and start the crawler thread on it."""
    pendingNames = Queue.Queue()
    crawler = twitterThread(pendingNames)
    crawler.start()
class twitterThread(threading.Thread):
    """Scheduler thread that starts at most one feedGetter per polling cycle.

    Args:
        newNamesQ: Queue of handles that have not been crawled yet. Names
            taken from it are moved to an internal ``oldNamesQ`` so they are
            revisited in round-robin order afterwards.

    Attributes:
        POLL_INTERVAL: Minimum number of seconds between two cycles.
    """

    # Minimum spacing between feedGetter launches, in seconds.
    POLL_INTERVAL = 45

    def __init__(self, newNamesQ):
        threading.Thread.__init__(self)
        self.newNamesQ = newNamesQ
        self.oldNamesQ = Queue.Queue()
        self.api = twitter.Api()

    def _nextUser(self):
        """Return the next handle to crawl, or None when both queues are empty.

        newNamesQ is consulted first; oldNamesQ is only used when it yields
        nothing.
        """
        try:
            user = self.newNamesQ.get_nowait()
        except Queue.Empty:
            user = None
            logger.write_line("No name in newNamesQ")
        else:
            logger.write_line("Looking for name in newNamesQ")
        if user is None:
            try:
                user = self.oldNamesQ.get_nowait()
            except Queue.Empty:
                logger.write_line("No name in oldNamesQ")
            else:
                logger.write_line("Looking for name in oldNamesQ")
        return user

    def run(self):
        """Loop forever, handing one name per cycle to a new feedGetter thread.

        Each cycle pulls a name from newNamesQ if available, otherwise from
        oldNamesQ, re-queues it on oldNamesQ and spins off a feedGetter for
        it. Cycles start no less than POLL_INTERVAL seconds apart.

        NOTE: names only enter through newNamesQ; re-seeding the queues from
        the "twitterFeeds" folder is not implemented in this method.
        """
        startTime = time.time()
        while True:
            user = self._nextUser()
            if user is not None:
                # Re-queue first so the name is revisited on a later cycle.
                self.oldNamesQ.put(user)
                logger.write_line("Appended %s to oldNamesQ" % user)
                feedGetterThread = feedGetter(user, self.api)
                feedGetterThread.start()
                logger.write_line("Sent name %s to feedGetter" % user)
            # Sleep even when no name was found so that empty queues do not
            # turn this loop into a busy spin.
            loopTime = time.time() - startTime
            time.sleep(max(0, self.POLL_INTERVAL - loopTime))
            startTime = time.time()
# Start the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()