# -*- coding: utf-8 -*-
"""
Created on Mon May 19 09:23:32 2014
Usage: python geo_twitter.py <query_term> <number_of_tweets>
Description: Collects tweets from the Twitter Streaming API, filtering for
those with geolocation metadata and a query term. The user specifies the query
term and the number of tweets to collect. The data, along with information on
whether the query term was mentioned as a user account, a hashtag, both, or
neither, are aggregated by US state.
Special dependencies: tweepy (https://github.com/tweepy/tweepy) and
pygeocoder (https://bitbucket.org/xster/pygeocoder/wiki/Home)
@author: Juan Manuel Contreras (juan.manuel.contreras.87@gmail.com)
"""
# Import modules
from re import search
from time import sleep
from random import random
from sys import argv, stderr
from pandas import DataFrame
from csv import writer, reader
from pygeocoder import Geocoder
from tweepy import OAuthHandler, StreamListener, streaming


def stream_geo_tweets(file_name, query_term, n_total_tweet):
    '''Collect tweets with a query term and geolocation metadata'''

    # Authorize connection to Twitter
    auth = OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
    auth.set_access_token('ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')

    # Open the file to which tweets will be written and write the header row
    with open(file_name, 'wb') as f:
        writer(f).writerow(['tweet', 'latitude', 'longitude'])

    # Define a custom class to listen to Twitter's stream
    class CustomStreamListener(StreamListener):

        def __init__(self, api=None):
            super(CustomStreamListener, self).__init__()
            self.n_tweet = 0

        def on_status(self, status):
            try:
                latitude = longitude = None
                # If a tweet has exact coordinates, use them (the geo field
                # reports [latitude, longitude])
                if status.geo is not None:
                    latitude, longitude = status.geo['coordinates']
                # Otherwise, fall back to the first corner of the tweet's
                # place bounding box (reported as [longitude, latitude])
                elif status.place is not None:
                    longitude, latitude = \
                        status.place.bounding_box.coordinates[0][0]
                # If the tweet is associated with coordinates, then write it
                if latitude is not None and longitude is not None:
                    with open(file_name, 'ab') as f:
                        row = [status.text.encode('UTF-8'), latitude,
                               longitude]
                        writer(f).writerow(row)
                    self.n_tweet += 1
                    # Returning False disconnects the stream once enough
                    # tweets have been collected
                    if self.n_tweet == n_total_tweet:
                        return False
            except Exception, e:
                print >> stderr, 'Encountered exception:', e

        def on_error(self, status_code):
            print >> stderr, 'Encountered error with status code:', status_code
            return True

        def on_timeout(self):
            print >> stderr, 'Timeout...'
            return True

    # Listen to the Twitter stream, filtering by the query term
    streaming_api = streaming.Stream(auth, CustomStreamListener(), timeout=60)
    streaming_api.filter(track=[query_term])
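
# A row appended to the output CSV by the listener above would look roughly
# like this (illustrative values only):
#   "Catch the game tonight! #nba",40.7128,-74.006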


def analyze_tweets(file_name, query_term, n_total_tweet):
    '''Determine the US state to which each tweet belongs and whether the
    query term mentioned refers to a user account, a hashtag, both, or
    neither'''

    # Initialize an empty DataFrame
    df = DataFrame({'state': [None] * n_total_tweet,
                    'u_o_h': [None] * n_total_tweet})

    # Iterate through all tweets, skipping the header row
    with open(file_name, 'rb') as f:
        next(f)
        for i, row in enumerate(reader(f)):
            # Extract tweet text and geolocation data
            tweet = row[0]
            latitude, longitude = [float(j) for j in row[1:]]
            # Tag the state and determine whether the query term was used as
            # a user mention or a hashtag
            df.loc[i, 'state'] = reverse_geocode(latitude, longitude)
            df.loc[i, 'u_o_h'] = user_or_hashtag(tweet, query_term)
            # Sleep for at least 200 ms between requests to avoid upsetting
            # the Google Maps API
            sleep(0.2 + random())

    # Return the DataFrame
    return df
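
# The DataFrame returned above has one row per collected tweet, for example
# (illustrative values):
#      state  u_o_h
#   0     NY   hash
#   1     CA   none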


def reverse_geocode(latitude, longitude):
    '''Perform reverse geocoding to identify each tweet's US state, if any'''

    # Perform reverse geocoding with the Google Maps API via pygeocoder
    try:
        geo_obj = Geocoder.reverse_geocode(latitude, longitude)
    except Exception, e:
        print >> stderr, 'Encountered exception:', e
        return None

    # Extract the address components of the first (best) result
    address = geo_obj.data[0]['address_components']

    # Return the state abbreviation if a component contains it
    for component in address:
        if 'administrative_area_level_1' in component['types']:
            return component['short_name'].encode('UTF-8')

    # No state-level component was found
    return None
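
# For instance, reverse_geocode(40.7128, -74.0060) would be expected to return
# 'NY', while coordinates outside the US return None or a non-US region code
# (the exact value depends on the Google Maps response).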


def user_or_hashtag(tweet, query_term):
    '''Determine whether the query term mentioned refers to a user account, a
    hashtag, both, or neither'''

    # Determine whether the mention is a user account, a hashtag, or neither
    # (a user mention takes precedence when both are present)
    if search('@' + query_term.lower(), tweet.lower()):
        return 'user'
    elif search('#' + query_term.lower(), tweet.lower()):
        return 'hash'
    else:
        return 'none'
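
# For example, user_or_hashtag('Go follow @nba!', 'nba') returns 'user' and
# user_or_hashtag('Game night #NBA', 'nba') returns 'hash'; a tweet containing
# both is labeled 'user' because the user-mention check runs first.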


def agg_by_state(df):
    '''Aggregate data by US state, summing all relevant metrics'''

    # Define lambda functions for aggregation
    count_user = lambda x: sum(x == 'user')
    count_hash = lambda x: sum(x == 'hash')
    count_none = lambda x: sum(x == 'none')
    # Ratio of user mentions to hashtag mentions (floating-point division)
    count_user_hash = lambda x: (float(count_user(x)) / count_hash(x)) \
        if count_hash(x) > 0 else 0

    # Create an aggregation dictionary
    agg_dict = {'count': len, 'n_user': count_user, 'n_hash': count_hash,
                'n_none': count_none, 'user_hash': count_user_hash}

    # Perform the aggregation by state
    grouped = df.groupby(by='state', as_index=False)
    df = grouped['u_o_h'].agg(agg_dict)

    # Load the state abbreviation-to-name lookup table
    with open(r'J:\WDPRO\BPM\us_states.csv', 'r') as f:
        states = {}
        for abbrev, name in reader(f):
            states[abbrev] = name
    states = DataFrame(data=states.values(), index=states.keys())

    # Restrict the results to US states
    df = df[df.state.isin(states.index)]

    # Join the full state name and lowercase it
    df = df.join(states, on='state')
    df.rename(columns={0: 'state_name'}, inplace=True)
    df['state_name'] = [i.lower() for i in df['state_name']]

    # Rank the states by tweet count
    df['count_rank'] = df['count'].rank(ascending=False)

    # Return the DataFrame
    return df
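
# The aggregated DataFrame produced above carries one row per state with the
# columns state, count, n_user, n_hash, n_none, user_hash, state_name, and
# count_rank, e.g. (illustrative values):
#   state  count  n_user  n_hash  n_none  user_hash  state_name  count_rank
#   CA     120    30      60      30      0.5        california  1.0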


def main(argv):
    # Declarations
    file_name = 'geo_twitter.csv'
    query_term = argv[1]
    n_total_tweet = int(argv[2])

    # Stream tweets with coordinates
    stream_geo_tweets(file_name, query_term, n_total_tweet)

    # Analyze tweets (geocode and identify hashtag/user references)
    df = analyze_tweets(file_name, query_term, n_total_tweet)

    # Aggregate results by state
    df = agg_by_state(df)

    # Save results to a CSV file
    df.to_csv(path_or_buf='geo_twitter_stats.csv', index=False)


if __name__ == '__main__':
    main(argv)