-
Notifications
You must be signed in to change notification settings - Fork 2
/
builder.py
92 lines (79 loc) · 3.62 KB
/
builder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# coding: utf-8
import time, argparse, sys, os
import feedparser, requests
import urllib.parse
from bs4 import BeautifulSoup, Tag
try:
from bs4 import FeatureNotFound
except ImportError:
FeatureNotFound = ValueError
def get_cmdline_args():
    """Parse and return the command-line arguments for the RSS builder.

    Positional arguments: the listing URL, the CSS selector for the item
    links, the CSS selector for the item content, and the output path
    ('-' for stdout). Optional: -p/--pretty and --ignored-query-params.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument('url', help = 'URL to get the list of articles from')
    # typo fixed: "retreive" -> "retrieve"
    argparser.add_argument('list_selector', help = 'CSS selector to retrieve items URLs (<a> tags)')
    argparser.add_argument('item_selector', help = 'CSS selector used to extract the relevant content from the URL the previous selector returned.')
    argparser.add_argument('output', help = 'Path of the resulting RSS file. Use "-" for stdout')
    # typo fixed: "prettyfied" -> "prettified"
    argparser.add_argument('-p', '--pretty', action = 'store_true', help = 'Specify that the output should be prettified')
    argparser.add_argument('--ignored-query-params', nargs = '*', default = [], help = 'Query parameters to remove when generating the links')
    return argparser.parse_args()
def new_tag(tag, string):
    """Create a standalone <tag> element whose text content is *string*."""
    element = Tag(name = tag)
    element.string = string
    return element
def _strip_ignored_params(item_url, ignored_qp):
    # Remove the query parameters named in ignored_qp and re-encode the rest.
    # urlencode re-quotes the values decoded by parse_qsl; the original
    # k+'='+v rebuild emitted decoded values verbatim, producing invalid
    # URLs whenever a value contained reserved characters (&, =, spaces...).
    parsed = urllib.parse.urlparse(item_url)
    kept = [(k, v) for k, v in urllib.parse.parse_qsl(parsed.query)
            if k not in ignored_qp]
    return urllib.parse.urlunparse((
        parsed.scheme,
        parsed.netloc,
        parsed.path,
        parsed.params,
        urllib.parse.urlencode(kept),
        parsed.fragment))

def build_rss(url, list_selector, item_selector, ignored_qp, output, pretty = False):
    """Build an RSS 2.0 feed from a page that lists article links.

    Fetches `url`, selects the article anchors with `list_selector`, then
    fetches every article page and extracts its content with
    `item_selector` into the item's <description>.

    Parameters:
        url: URL of the page listing the articles.
        list_selector: CSS selector matching the items' <a> tags.
        item_selector: CSS selector extracting the content of an item page.
        ignored_qp: query-parameter names stripped from the item links.
        output: path of the resulting RSS file, or '-' for stdout.
        pretty: when True, pretty-print the XML output.
    """
    # Prefer the lxml XML builder; fall back to bs4's default parser when
    # lxml is missing (FeatureNotFound is shimmed at import time for old bs4).
    try:
        soup = BeautifulSoup('<rss version="2.0" />', 'xml')
        rss = soup.rss
        has_lxml = True
    except FeatureNotFound:
        rss = BeautifulSoup('<rss version="2.0" />').rss
        has_lxml = False

    headers = {"User-Agent": "RSS Builder"}
    parse_html = ((lambda text: BeautifulSoup(text, 'lxml').html) if has_lxml
                  else (lambda text: BeautifulSoup(text).html))

    # Timeout so a stalled server cannot hang the build forever.
    r = requests.get(url, headers=headers, timeout=30)
    list_html = parse_html(r.text)

    channel = Tag(name = 'channel')
    rss.append(channel)
    channel.append(new_tag('title', list_html.head.title.string))
    channel.append(new_tag('link', url))
    channel.append(new_tag('description', '--'))
    # RFC 822 date format, as required by the RSS 2.0 specification.
    channel.append(new_tag('lastBuildDate',
                           time.strftime('%a, %d %b %Y %H:%M:%S +0000', time.gmtime())))
    channel.append(new_tag('generator', 'RSS Builder'))

    for anchor in list_html.select(list_selector):
        item_url = _strip_ignored_params(
            urllib.parse.urljoin(url, anchor['href']), ignored_qp)
        r = requests.get(item_url, headers=headers, timeout=30)
        item_html = parse_html(r.text)
        item = Tag(name = 'item')
        item.append(new_tag('title', item_html.head.title.string))
        item.append(new_tag('link', item_url))
        item.append(new_tag('description',
                            "<br/>".join(map(str, item_html.select(item_selector)))))
        channel.append(item)

    serialize = (lambda x: x.prettify()) if pretty else str
    if output == '-':
        out_file = sys.stdout
        close_file = lambda: None
    else:
        dirname = os.path.dirname(output)
        # os.makedirs('') raises FileNotFoundError, so only create the
        # directory when the output path actually has a directory part.
        # exist_ok avoids a race with a concurrently-created directory.
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        # Explicit UTF-8: the XML declaration written below promises it,
        # whereas a bare open() would use the locale encoding.
        out_file = open(output, 'w', encoding='utf-8')
        close_file = out_file.close
    # Ensure the output file is closed even if serialization fails.
    try:
        if has_lxml:
            out_file.write(serialize(soup))
        else:
            # bs4's HTML fallback does not emit an XML declaration itself.
            out_file.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
            out_file.write(serialize(rss))
        out_file.write('\n')
    finally:
        close_file()
if __name__ == '__main__':
    # Script entry point: parse the CLI arguments and build the feed.
    args = get_cmdline_args()
    build_rss(
        args.url,
        args.list_selector,
        args.item_selector,
        args.ignored_query_params,
        args.output,
        pretty=args.pretty,
    )