-
Notifications
You must be signed in to change notification settings - Fork 0
/
rss-from-webpage.py
166 lines (135 loc) · 6.16 KB
/
rss-from-webpage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
Turn a web page into an RSS feed.
Assumes every article on the page is an element matched by a single
CSS selector, and currently hard-codes how each article is extracted
(title = the text of the element's first A tag, link = that A tag's href,
description = the text of the first P tag within the element).
"""
# TODO: Output on the web somewhere
# TODO: Test against RSS validator
# TODO: don't hard code item extraction selectors - pass in somehow
# TODO: automatically get title etc from page
# TODO: but allow overrides e.g. title
# TODO: optionally get content from each link target
# TODO: optionally take RSS with summaries, add full content (selector specified, finding content too hard)
# TODO: get dates on items from...HTTP?
# TODO: get dates on items from selector (parsing a date string)
# TODO: get dates on items from selector (on top page or article page)
# TODO: other attributes? of feed / items?
# TODO: be a good citizen
# TODO: - check if there's already a feed and say so
# TODO: - keep local cache (just the RSS? serialised as JSON? pickle?)
# TODO: - don't scrape top page if HTTP says cached / date unchanged
# TODO: - don't scrape article page if HTTP says date unchanged
# TODO: - don't scrape item if date (from selector / date string parsing) unchanged
# TODO: better docstrings
# TODO: allow to be run as a web service. fire only when output url is hit/run
# TODO: a repository / dictionary of feed_settings definitions
# TODO: a shared / master repository of feed_settings definitions
# TODO: define a pattern common to multiple pages on site, allow choosing by name
# TODO: - e.g. columnists listed on page, link to each of their names
# TODO: take feed definition name as URL parameter
# TODO: handle routes so /feed/feed_name/ works
# TODO: or take all feed settings as URL parameters?
# TODO: requirements.txt
# TODO: any kind of unit test :|
import collections
import requests
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator
# Record describing one extracted article. The field names are given as an
# ordered tuple (not a set) so that the field order is the same on every run.
_FeedArticle = collections.namedtuple(
    'feed_article', ('link', 'title', 'description'))


def get_articles_from_html(container_html):
    """
    Extract one article record from each element of ``container_html``.

    The link is the ``href`` of the element's first ``a`` tag, the title is
    that tag's ``string`` and the description is the ``string`` of the
    element's first ``p`` tag. Elements without an ``a`` tag, or whose ``a``
    tag has no ``href``, are skipped; a missing ``p`` tag gives a
    description of ``None``.

    :param container_html: iterable of elements offering ``find(tag_name)``
        (e.g. the list returned by BeautifulSoup's ``select``)
    :return: a set of ``feed_article`` namedtuples with the fields
        ``link``, ``title`` and ``description`` (in that order)
    """
    articles = set()
    for child in container_html:
        # TODO pass in criteria for choosing item, don't hard code
        anchor = child.find('a')  # TODO hardcoded
        link = anchor.get('href') if anchor is not None else None
        if not link:
            # Without a link there is nothing for a feed item to point at.
            continue
        paragraph = child.find('p')  # TODO hardcoded
        description = paragraph.string if paragraph is not None else None
        articles.add(
            _FeedArticle(link=link, title=anchor.string,
                         description=description))
    return articles
def generate_rss_from_articles(feed_settings, articles):
    """
    Creates a FeedGenerator feed from feed metadata and a set of articles.

    :param feed_settings: object with the attributes ``title``, ``author``,
        ``source_page_url``, ``output_url``, ``logo_img_url``, ``subtitle``
        and ``language``
    :param articles: iterable of records with ``link``, ``title`` and
        ``description`` attributes
    :return: the populated FeedGenerator
    """
    # Local import: only this function needs it.
    from urllib.parse import urljoin

    # create the feed
    output_feed = FeedGenerator()

    # add metadata to the feed
    output_feed.title(feed_settings.title)
    output_feed.author(feed_settings.author)
    output_feed.link(href=feed_settings.source_page_url, rel='alternate')
    output_feed.link(href=feed_settings.output_url, rel='self')
    output_feed.logo(feed_settings.logo_img_url)
    output_feed.subtitle(feed_settings.subtitle)
    output_feed.language(feed_settings.language)
    # TODO: set a feed-level id (output_feed.id(...))

    # add each feed item
    for article in articles:
        # hrefs scraped from a page may be relative; resolve them against the
        # page they came from so the feed carries absolute URLs. Links that
        # are already absolute pass through unchanged.
        link = urljoin(feed_settings.source_page_url, article.link)
        feed_entry_added = output_feed.add_entry()
        feed_entry_added.id(link)  # ATOM
        # TODO: guid for RSS?
        feed_entry_added.link(href=link, rel='alternate')  # ATOM
        feed_entry_added.title(article.title)
        feed_entry_added.description(article.description)
    return output_feed
def output_rss(rss, filename):
    """
    Sends RSS to a file, and to stdout as well if debugging.

    :param rss: feed object offering ``rss_str(pretty=...)`` and
        ``rss_file(filename)`` (e.g. a feedgen FeedGenerator)
    :param filename: path of the RSS file to write
    :return: none
    """
    if debug:
        xml = rss.rss_str(pretty=True)
        # rss_str may hand back bytes; decode so the XML is printed as text
        # instead of as a b'...' literal with escaped newlines.
        if isinstance(xml, bytes):
            xml = xml.decode('utf-8')
        print(xml)
    rss.rss_file(filename)
def rss_from_webpage(feed_settings):
    """
    Fetch the source page and build a feed from the articles found on it.

    :param feed_settings: object providing ``source_page_url`` and
        ``container_CSS_selector``, plus the feed metadata attributes read
        by ``generate_rss_from_articles``
    :return: the feed returned by ``generate_rss_from_articles``
    :raises requests.RequestException: if the page cannot be fetched, the
        request times out, or the server answers with an error status
    """
    # A timeout stops a stalled server from hanging the script forever.
    response = requests.get(feed_settings.source_page_url, timeout=30)
    # Fail loudly on an HTTP error rather than scraping an error page into
    # an empty feed.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    # Every element matching the selector is treated as one article.
    container_html = soup.select(feed_settings.container_CSS_selector)
    articles = get_articles_from_html(container_html)
    return generate_rss_from_articles(feed_settings, articles)
def main():
    """
    Build the example feed (Annabel Crabb's SMH columns) and write it out.

    :return: none
    """
    # TODO: feels like this doesn't go in main. what goes in main normally? do I need a function with the same name as the module?
    # Field names are an ordered tuple (not a set) so the field order of the
    # namedtuple is the same on every run.
    feed_settings = collections.namedtuple('feed_settings',
                                           ('source_page_url',
                                            'container_CSS_selector',
                                            'output_file', 'output_url',
                                            'title', 'subtitle', 'author',
                                            'logo_img_url',
                                            'language'))

    # TODO: feels like this should be only used as debug / when nothing passed in given it's default/test values
    this_feed_settings = feed_settings(
        source_page_url='http://www.smh.com.au/comment/by/Annabel-Crabb-hvecc',
        container_CSS_selector='main.main div.story__wof',
        output_file='annabel-crabb-smh.atom',
        output_url='https://www.lukemorey.com/annabel-crabb-smh.atom',  # TODO this should be base URL + filename
        title='Annabel Crabb SMH',
        subtitle='Annabel Crabb is a regular columnist, TV host and leading political commentator.',
        author={'name': 'Annabel Crabb'},
        # there are other field options here like email...
        logo_img_url='http://www.smh.com.au/content/dam/images/h/v/e/c/d/image.imgtype.columnistThumbnail.90x90.png/1408400781813.png',
        language='en')

    rss = rss_from_webpage(this_feed_settings)
    output_rss(rss, this_feed_settings.output_file)
# Module-level flag read by output_rss(): when true, the generated RSS is
# printed to stdout as well as being written to the output file.
debug = True
# Only run the scrape when executed as a script, not when imported.
if __name__ == "__main__":
    main()