twitter2rss.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pickle, tweepy, urllib, pytz, logging, os, time
from datetime import datetime, timedelta
from feedgen.feed import FeedGenerator
from settings import *
from readability.readability import Document

if using_readability_api:
    import json

# Adjust the process time zone to the configured locale. This may only work
# on Unix systems; if it causes problems, remove/comment the next two lines.
os.environ['TZ'] = locale
time.tzset()
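
# For reference, the names star-imported from settings above are expected to
# look roughly like the following. This is a minimal sketch; every value here
# is an illustrative assumption, not part of this script:
#
#   locale = 'Europe/London'        # tz database name, also used for feed timestamps
#   log_file = 'twitter2rss.log'
#   buffer_file = 'tweets.pickle'   # pickle file that buffers tweets between runs
#   rss_file = 'twitter.rss'        # output feed
#   feed_item_limit = 50            # maximum number of entries kept in the feed
#   old = 7                         # tweets older than this many days are purged
#   using_readability_api = False   # True: use the hosted parser; False: parse locally
#   readability_api_token = ''      # only needed when using_readability_api is True
#   twitter_keys = {'consumer_key': '...', 'consumer_secret': '...',
#                   'access_token': '...', 'access_token_secret': '...'}
#   meta = {'id': 'http://example.com/twitter.rss', 'title': 'My Twitter links',
#           'author': {'name': 'Me', 'email': 'me@example.com'},
#           'subtitle': 'Links from my timeline',
#           'link': 'http://example.com/twitter.rss', 'language': 'en'}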
# Set up the logger
logger = logging.getLogger('T2R')
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler(log_file)
fh.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(lineno)d - %(message)s', '%Y-%m-%d %I:%M:%S %p %Z')
fh.setFormatter(formatter)
logger.addHandler(fh)
def load_buffer(fileName):
    # Load previously buffered tweets from the pickle file on disk.
    try:
        buffered = pickle.load(open(fileName, "rb"))
    except Exception as e:
        logger.error(e)
        logger.warning("[Buffer_Load_ERR]: Unable to load tweets from the buffer file. The script may be running for the first time; in that case the buffer is generated at the end of the run.")
        buffered = []
    else:
        logger.info("Buffer loaded successfully with " + str(len(buffered)) + " tweets.")
    return buffered

def get_lastID(buffered):
    # Highest tweet ID seen so far; raises ValueError on an empty buffer.
    id_list = []
    for tweet in buffered:
        id_list.append(tweet['id'])
    return max(id_list)

def parse_twitter(buffered, keys):
    try:
        auth = tweepy.OAuthHandler(keys['consumer_key'], keys['consumer_secret'])
        auth.set_access_token(keys['access_token'], keys['access_token_secret'])
        api = tweepy.API(auth)
        # If authentication was successful, the account name shows up in the log
        logger.debug('Fetching feeds for twitter user: ' + api.me().name)
    except Exception as e:
        logger.error(e)
        logger.critical('Unable to log in to Twitter. Quitting')
        exit()
    try:  # Fails on an empty buffer, e.g. when the script runs for the first time
        lastID = get_lastID(buffered)
    except Exception:
        logger.warning('Cannot retrieve the last ID, retrieving the last 200 tweets instead')
        pub = api.home_timeline(count=200)  # fetch the timeline without a since ID
        for i in pub:
            parse_tweet(i)
    else:
        logger.debug("Will fetch tweets since tweet ID " + str(lastID))
        for i in tweepy.Cursor(api.home_timeline, since_id=str(lastID)).items():
            parse_tweet(i)
        # without the cursor: pub = api.home_timeline(count=200, since_id=str(lastID))
    return buffered

def parse_tweet(i):
    tweet = {}
    s = i.entities['urls']
    if len(s) > 0:  # Only tweets carrying a URL become feed items
        urls = s[0]
        tweet['url'] = urls['expanded_url']
        try:
            if using_readability_api:
                api_url = 'https://www.readability.com/api/content/v1/parser?url=' + tweet['url'] + '&token=' + readability_api_token
                readable = json.loads(urllib.urlopen(api_url).read())
                tweet['readable_title'] = readable['title']
                tweet['readable_article'] = readable['content']
            else:
                html = urllib.urlopen(tweet['url']).read()
                tweet['readable_title'] = Document(html).title()
                tweet['readable_article'] = Document(html).summary()
        except Exception as e:
            logger.error(e)
        else:  # Only buffer the tweet if the linked article could be extracted
            tweet['text'] = i.text.encode('utf-8')
            tweet['screen_name'] = i.user.screen_name
            tweet['profile_image_url'] = i.user.profile_image_url
            tweet['user_name'] = i.user.name
            tweet['user_url'] = i.user.url
            tweet['id'] = i.id
            tweet['id_str'] = i.id_str
            tweet['created_at'] = i.created_at
            tweet['retweets'] = i.retweet_count
            try:
                # Prepend avatar, author and tweet text to the extracted article
                article_header = (u'<div><img src="' + tweet['profile_image_url'].decode('utf-8')
                                  + u'" alt="' + tweet['screen_name'].decode('utf-8')
                                  + u'" /><p><strong>' + tweet['user_name'].decode('utf-8')
                                  + u': </strong>' + tweet['text'].decode('utf-8')
                                  + u'</p></div><hr />')
            except Exception as e:
                logger.error(e)
            else:
                tweet['readable_article'] = article_header + tweet['readable_article']
            try:
                logger.info(tweet['id_str'].decode('utf-8', 'replace') + ' : @' + tweet['screen_name'].decode('utf-8', 'replace') + ' : ' + tweet['text'].decode('utf-8', 'replace'))
            except Exception as e:
                logger.error(e)
            buffered.insert(0, tweet)  # module-level buffer, newest first
            pickle.dump(buffered, open(buffer_file, "wb"))  # temporary dump; the buffer is pruned later in prune()
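
# For reference, each buffered entry is a plain dict whose keys are exactly the
# ones set in parse_tweet above: url, readable_title, readable_article, text,
# screen_name, profile_image_url, user_name, user_url, id, id_str, created_at,
# retweets. prune() and generateFeeds() below rely on this shape.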
def prune(buffered, item_limit, old):
    # Iterate in reverse so items can be removed while looping over the list
    for i in reversed(buffered):
        delta = datetime.now(pytz.utc) - pytz.utc.localize(i['created_at'])
        if delta > timedelta(days=old):
            logger.info('Purging an old tweet - ' + i['screen_name'] + " " + i['text'].decode('utf-8'))
            buffered.remove(i)
    # Most retweeted (and, on ties, newest) first
    sorted_buffer = sorted(buffered, key=lambda k: (k['retweets'], k['id']))
    sorted_buffer.reverse()
    del sorted_buffer[item_limit:]  # cap the feed at a maximum number of items
    pickle.dump(buffered, open(buffer_file, "wb"))  # on disk, keep everything that is not too old
    return sorted_buffer

def generateFeeds(buffered, meta):
    utc = pytz.utc
    fg = FeedGenerator()
    fg.id(meta['id'])
    fg.title(meta['title'])
    fg.author(meta['author'])
    fg.subtitle(meta['subtitle'])
    fg.link(href=meta['link'], rel='self')
    fg.language(meta['language'])
    for tweet in buffered:
        fe = fg.add_entry()
        fe.id(tweet['url'].decode('utf-8'))
        fe.published(utc.localize(tweet['created_at']).astimezone(pytz.timezone(locale)))
        #fe.guid(tweet['url'].decode('utf-8'))
        fe.link(href=tweet['url'].decode('utf-8'), rel='alternate')
        fe.title(tweet['readable_title'])
        fe.description(tweet['readable_article'])
        try:
            # Abuse the author field so feed readers show who tweeted the link
            fe.author({'name': '', 'email': tweet['user_name'].decode('utf-8') + ": " + tweet['text'].decode('utf-8')})
        except Exception as e:
            logger.error(e)
            fe.author({'name': 'a', 'email': 'a@a.com'})
    return fg

def write_rss(feedGenerator, fileName):
    # Write the RSS feed to a file
    logger.debug('Writing RSS file')
    try:
        feedGenerator.rss_file(fileName)
    except Exception as e:
        logger.error(e)
    else:
        logger.info('RSS file ' + fileName + " written successfully.")

# Main flow: load the buffer, fetch new tweets, prune, then write the feed
logger.info('Session Started')
buffered = load_buffer(buffer_file)
parsed = parse_twitter(buffered, twitter_keys)
parsed = prune(parsed, feed_item_limit, old)
feed = generateFeeds(parsed, meta)
write_rss(feed, rss_file)
logger.info('Session Finished\n\n')
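
# The buffer and since_id logic above assume the script runs periodically. A
# crontab entry along these lines would do; the path is an illustrative
# assumption, adjust it to your setup:
#
#   */30 * * * * cd /path/to/twitter2rss && ./twitter2rss.py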