#!/usr/bin/env python
# Sample script that collects tweets matching a string.
#
# This file is part of the Tweeples collection of scripts.
#
# Tweeples is free software: you can redistribute it and/or modify it
# under the terms of the BSD license. For the full terms of the license
# see the file `COPYING' in this directory.
'''Collect tweets matching a text pattern and store them
continuously in JSON-formatted lines of a local file.'''
__author__ = 'Giorgos Keramidas <gkeramidas@gmail.com>'

import argparse
import json
import sys

import twitter
from twitter.api import TwitterHTTPError

from util import message, warning
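
# message() and warning() come from the util module shipped with this
# collection of scripts; judging from how they are called below, they
# behave like printf-style loggers (a format string plus arguments).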
def search(text, max_pages=10, results_per_page=100):
    """Generator for searching 'text' in Twitter content.

    Search the public Twitter timeline for tweets matching a 'text' string,
    which can also be a hash tag, and yield a batch of matched tweets every
    time we have some results.

    Args:
        text              str, the text to search for in Twitter. This can
                          be a plain text string or a '#hashtag' to look
                          for tweets on this topic only.
        max_pages         int, maximum number of result 'pages' to obtain
                          from Twitter's backlog of archived tweets. When
                          not specified, default to 10 pages.
        results_per_page  int, maximum number of results per page to fetch
                          from Twitter's backlog of archived tweets. When
                          not specified, default to 100 tweets per page.

    Yields:
        A list of dicts. Every dict in a yielded batch is a 'result' from
        twitter.Twitter.search and represents a single tweet.
    """
    while True:
        # Poll forever: re-create the API handle and walk the result pages
        # again on every pass, yielding one batch of matches per page.
        t = twitter.Twitter(domain="search.twitter.com")
        for page in range(1, max_pages + 1):
            yield t.search(q=text, rpp=results_per_page, page=page)['results']
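
# A minimal usage sketch for search(), assuming the pre-v1.1 search API
# this script was written against still answers (the query is only an
# example):
#
#     for batch in search('#python', max_pages=2, results_per_page=50):
#         for tweet in batch:
#             print(tweet['id'], tweet['from_user'], tweet['text'])
#
# The generator never terminates on its own, so callers must break out
# of the loop or run until interrupted.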
def preload_tweets(filename):
    """Preload previously seen tweets from a text file.

    Args:
        filename  str, name of the file where we preload tweets from.

    Returns:
        A set() containing all the numeric 'id' attributes of tweets we
        have already seen.
    """
    if not filename:
        return set()
    try:
        with open(filename, 'r') as istream:
            seen = set(json.loads(line)['id'] for line in istream)
    except Exception:
        seen = set()            # Avoid returning partial results on error
    return seen
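
# Sketch of the on-disk format preload_tweets() expects: one JSON object
# per line, each carrying a numeric 'id' field (all other fields are
# ignored while preloading). For instance, a file with the two lines
#
#     {"id": 1, "from_user": "alice", "text": "hello"}
#     {"id": 2, "from_user": "bob", "text": "world"}
#
# makes preload_tweets() return the set {1, 2}. The values shown are
# illustrative only.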
def streamsearch(ofile, text, max_pages=10, results_per_page=100):
    """Stream the results of searching for 'text' to the 'ofile' output file.

    Args:
        ofile             str, the name of a file where we will write any
                          tweets we find. Tweets are written in JSON format,
                          with every tweet stored in a separate line.
        text              str, the text to search for in Twitter. This can
                          be a plain text string or a '#hashtag' to look
                          for tweets on this topic only.
        max_pages         int, maximum number of result 'pages' to obtain
                          from Twitter's backlog of archived tweets. When
                          not specified, default to 10 pages.
        results_per_page  int, maximum number of results per page to fetch
                          from Twitter's backlog of archived tweets. When
                          not specified, default to 100 tweets per page.

    Returns:
        None
    """
    # Load the ids of already seen tweets, if there are any.
    ofilename = ofile or 'standard output'
    seen = preload_tweets(ofile) if ofile else set()
    if seen:
        message('%d tweets preloaded from %s', len(seen), ofilename)
    ostream = None              # So the except clause can test it safely
    try:
        ostream = open(ofile, 'a+') if ofile else sys.stdout
        for matches in search(text, max_pages=max_pages,
                              results_per_page=results_per_page):
            newmatches = 0
            for tweet in matches:
                # Only the numeric id is needed for duplicate detection.
                # Note: we deliberately avoid binding tweet['text'] to the
                # name 'text', which would clobber the search pattern.
                tid = tweet['id']
                if tid not in seen:
                    newmatches += 1
                    seen.add(tid)
                    print(json.dumps(tweet), file=ostream)
            if newmatches > 0:
                message('%d new tweets logged at %s', newmatches, ofilename)
        ostream.close()
    except IOError as e:
        if ostream and ostream != sys.stdout:
            ostream.close()
        warning('Error writing at file "%s". %s', ofilename, e)
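
# streamsearch() can also be driven directly from other code; a hedged
# example (the file name and query below are illustrative):
#
#     streamsearch('python.json', '#python')  # append matches to a file
#     streamsearch(None, '#python')           # stream matches to stdout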
if __name__ == '__main__':
    # Parse command-line args for the output file name and search text.
    parser = argparse.ArgumentParser(description=(
        'Collect tweets matching a text pattern and store them '
        'continuously in JSON-formatted lines of a local file.'))
    parser.add_argument('-o', '--output', metavar='FILE', type=str,
                        default=None, help='output file name')
    parser.add_argument('TEXT', nargs='+', type=str, default=None,
                        help='text to search for in tweet content')
    args = parser.parse_args()
    json_filename = args.output         # Where to store matching tweets
    lookup_text = ' '.join(args.TEXT)   # Text to search for

    # Keep searching for tweets, until manually interrupted. HTTP errors
    # (e.g. rate-limit responses) are logged and the search is retried.
    while True:
        try:
            streamsearch(json_filename, lookup_text)
        except TwitterHTTPError as e:
            warning('Skipping HTTP error %s [...]', str(e).split('\n')[0])
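
# Example invocation (the output file and search text are illustrative):
#
#     ./tool_collect_tweets.py -o python.json '#python'
#
# appends matching tweets to python.json, one JSON object per line, and
# skips ids already present in that file from earlier runs.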