#!/usr/bin/env python
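"""
Update a Readability library with new articles pulled from the user's
Twitter favourites, Hacker News's 'best' page, economist.com and
theatlantic.com.
"""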
import logging
import os
import re

import requests
import readability
from readability.api import ResponseError
from BeautifulSoup import BeautifulSoup as Soup
import twitter

# Config
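# As a sketch, inferred from the names used below, config.py must define:
#   TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN_KEY,
#   TWITTER_ACCESS_TOKEN_SECRET, TWITTER_USERNAME (Twitter credentials), and
#   CONSUMER_KEY, CONSUMER_SECRET, USERNAME, PASSWORD (Readability credentials)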
from config import *

urlfinder = re.compile(r"(https?://[^ )]+)")
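
# Log INFO and above to readability.log alongside this script; echo DEBUG
# and above to the console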
logger = logging.getLogger(__file__)
logger.setLevel(logging.DEBUG)
filepath = os.path.join(os.path.dirname(__file__), 'readability.log')
fh = logging.FileHandler(filepath)
fh.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
logger.addHandler(fh)
logger.addHandler(ch)


def is_url_an_article(url):
    """
    Test heuristically if the passed URL is an article
    """
    if url.endswith('.jpg'):
        return False
    banned_words = ('vimeo', 'youtube')
    for word in banned_words:
        if word in url:
            return False
    return True


def get_article_urls_from_twitter_favourites(username, n=20):
    """
    Return a list of up to n article URLs extracted from a user's Twitter
    favourites
    """
    logger.info("Fetching articles from Twitter favourites for %s", username)
    api = twitter.Api(
        consumer_key=TWITTER_CONSUMER_KEY,
        consumer_secret=TWITTER_CONSUMER_SECRET,
        access_token_key=TWITTER_ACCESS_TOKEN_KEY,
        access_token_secret=TWITTER_ACCESS_TOKEN_SECRET,
    )
    favourites = api.GetFavorites()
    urls = []
    for tweet in favourites:
        text = tweet.text
        # Look for a link in the tweet text
        match = urlfinder.search(text)
        if not match:
            continue
        # Follow any shortener redirects and check the target is an HTML
        # page before testing whether it looks like an article
        redirect_url = match.groups()[0]
        url_resp = requests.get(redirect_url)
        if 'text/html' not in url_resp.headers['content-type']:
            continue
        url = url_resp.url
        if is_url_an_article(url):
            urls.append(url)
    logger.info("Found %d articles", len(urls))
    return urls[:n]


def get_top_hacker_news_articles(n=5):
    """
    Return the URLs of the top n articles from Hacker News's 'best' page
    """
    logger.info("Fetching top Hacker News articles")
    source_url = 'http://news.ycombinator.com/best'
    soup = Soup(requests.get(source_url).content)
    urls = []
    for td in soup('td', attrs={'class': 'title'}):
        anchor = td.find('a')
        if not anchor:
            continue
        urls.append(anchor['href'])
        if len(urls) == n:
            break
    return urls


def get_economist_articles(num=10):
    """
    Return the URLs of up to num articles from the Economist's recommended
    list
    """
    logger.info("Fetching top Economist articles")
    source_url = 'http://www.economist.com'
    soup = Soup(requests.get(source_url).content)
    ul = soup.find('ul', id='recommended-list')
    urls = []
    for anchor in ul.findAll('a'):
        urls.append(source_url + anchor['href'])
    return urls[:num]


def get_atlantic_articles(num=10):
    """
    Return the URLs of up to num of the Atlantic's most popular articles
    """
    logger.info("Fetching top Atlantic articles")
    source_url = 'http://www.theatlantic.com'
    soup = Soup(requests.get(source_url).content)
    div = soup.find('div', id='mostPopular')
    urls = []
    for anchor in div.findAll('a'):
        urls.append(source_url + anchor['href'])
    return urls[:num]


def main():
    # Authenticate with the Readability API: xauth exchanges the user's
    # credentials for an OAuth token
    token = readability.xauth(
        CONSUMER_KEY, CONSUMER_SECRET, USERNAME, PASSWORD)
    rdd = readability.oauth(
        CONSUMER_KEY, CONSUMER_SECRET, token=token)
    user = rdd.get_me()

    logger.info("Updating readability library")
    library_urls = [u.article.url for u in user.bookmarks()]
    logger.info("Found %d articles in library", len(library_urls))

    # Fetch candidate URLs from each source
    urls = get_article_urls_from_twitter_favourites(TWITTER_USERNAME)
    urls += get_top_hacker_news_articles(5)
    urls += get_economist_articles(5)
    urls += get_atlantic_articles(2)  # Only 2 as it's too noisy
    logger.info("Found %d articles to add", len(urls))

    # Bookmark anything not already in the library
    num_dupes = num_new = num_errors = 0
    for url in urls:
        if url in library_urls:
            num_dupes += 1
        else:
            logger.info("Adding %s", url)
            try:
                rdd.add_bookmark(url)
            except ResponseError:
                num_errors += 1
            except Exception as e:
                logger.error("Unexpected exception: %s", e)
                num_errors += 1
            else:
                num_new += 1
    logger.info("Added %d new articles, found %d dupes, %d errors",
                num_new, num_dupes, num_errors)


if __name__ == '__main__':
    main()