forked from nicor88/twitter-to-kinesis
/
send_tweets_to_kinesis.py
136 lines (112 loc) · 5.43 KB
/
send_tweets_to_kinesis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
""" Send tweets from a twitter stream to Kinesis
Examples
--------
>>> TweetsCollector.run(stream_name='DevStream', producer='TestProducer', keywords=['python'])
"""
import datetime as dt
import json
import logging
import os
import boto3
from botocore.client import Config
from dateutil.parser import parse
import pytz
import ruamel_yaml as yaml
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
import settings
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('sender')
class TweetsCollector:
def __init__(self, *, stream_name, producer, keywords):
self.stream_name = stream_name
self.producer = producer
self.keywords = keywords
configs = self.configs_loader()
twitter_auth = configs['twitter']
# set twitter auth
self.auth = OAuthHandler(twitter_auth['consumer_key'], twitter_auth['consumer_secret'])
self.auth.set_access_token(twitter_auth['access_token'], twitter_auth['access_secret'])
@classmethod
def run(cls, *, stream_name, producer, keywords):
c = cls(stream_name=stream_name, producer=producer, keywords=keywords)
return c.get_tweets()
def get_tweets(self):
twitter_stream = Stream(self.auth, SendTweetsToKinesis(stream_name=self.stream_name,
producer=self.producer,
keywords=self.keywords))
logger.info(self.keywords)
twitter_stream.filter(track=self.keywords)
return self
@staticmethod
def configs_loader():
settings_path = os.path.realpath(os.path.dirname(settings.__file__))
configs_file = os.path.join(settings_path, 'configs.yml')
try:
with open(configs_file, 'r') as f:
configs = yaml.load(f)
return configs
except IOError:
logger.error('No configs.yml found')
class SendTweetsToKinesis(StreamListener):
def __init__(self, stream_name, producer, keywords):
super(StreamListener, self).__init__()
self.kinesis = boto3.client('kinesis', config=Config(connect_timeout=1000))
self.stream_name = stream_name
self.producer = producer
self.keywords = keywords
def on_data(self, data):
tweet = json.loads(data)
tweet_to_send = self.create_tweet_for_kinesis(name='twitter', tweet=tweet,
keywords=self.keywords,
producer=self.producer)
logger.info(tweet_to_send)
res = self.put_tweet_to_kinesis(stream_name=self.stream_name, tweet=tweet_to_send)
logger.info(res)
def on_error(self, status):
logger.error(status)
return True
@staticmethod
def create_tweet_for_kinesis(*, tweet, name='twitter', producer='stream_to_stream', keywords):
def get_user_created(user_created, time_zone):
matched_tz = [a for a in set(pytz.all_timezones_set) if time_zone in a]
parsed_time = parse(user_created)
parsed_time = parsed_time.replace(tzinfo=None)
if len(matched_tz) > 0:
user_created_tz = pytz.timezone(matched_tz[0])
user_created_time = user_created_tz.localize(parsed_time, is_dst=None)
else:
user_created_time = parsed_time.isoformat()
return user_created_time
def __clean_tweet(tweet_to_clean):
attrs = ['created_at', 'lang', 'geo', 'coordinates', 'place', 'retweeted', 'source',
'text', 'timestamp_ms']
user_attrs = ['name', 'screen_name', 'location', 'url', 'description',
'followers_count', 'created_at', 'utc_offset', 'time_zone', 'lang']
clean = {a: tweet_to_clean[a] for a in attrs}
# clean['created_at'] = parse(tweet_to_clean['created_at']).replace(tzinfo=None)
created_at = dt.datetime.fromtimestamp(int(clean['timestamp_ms'])/1000)
# setup UTC timezone, needed if the producer is not UTC time
created_at = created_at.astimezone(pytz.utc)
clean['created_at'] = created_at.isoformat()
clean['user'] = {a: tweet_to_clean['user'][a] for a in user_attrs}
clean['user']['created_at'] = get_user_created(clean['user']['created_at'],
clean['user']['time_zone'])
logger.debug(f'User created time {clean["user"]["created_at"]}')
clean['hashtags'] = [el['text'] for el in tweet_to_clean['entities']['hashtags']]
return clean
record = __clean_tweet(tweet)
record['name'] = name
record['meta'] = {'created_at': dt.datetime.utcnow().isoformat(), 'producer': producer,
'keywords': ','.join(keywords)}
if 'created_at' not in record.keys():
record['created_at'] = record['meta']['created_at']
return record
def put_tweet_to_kinesis(self, *, stream_name, tweet, partition_key='created_at'):
res = self.kinesis.put_record(StreamName=stream_name,
Data=json.dumps(tweet),
PartitionKey=partition_key)
return res
if __name__ == '__main__':
TweetsCollector.run(stream_name='DevStream', producer='TestProducer', keywords=['python'])