#!/usr/bin/env python
# -*- coding: utf-8 -*-
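"""
Download General Conference talks from lds.org, convert each talk to
Markdown, and store per-talk metadata alongside it as YAML.

Forked from iffy/generalconference.

NOTE: this is Python 2 code (urlparse, unicode, a print statement, and
0664-style octal literals).
"""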
import argparse
import copy
import hashlib
import sys
from urlparse import urlparse

import requests
import yaml
from filepath import FilePath
from lxml import etree
from lxml.html import soupparser


def log(*args):
    """Write a message to stderr."""
    sys.stderr.write(' '.join(map(unicode, args)) + '\n')


def writeIfDifferent(fp, content):
    """
    Write content to the file fp, but only if it differs from what is
    already on disk.
    """
    if not fp.exists() or fp.getContent() != content:
        fp.setContent(content)
        fp.chmod(0664)
        log('wrote {path} ({size} bytes)'.format(size=len(content), path=fp.path))


def mergeYAML(fp, data):
    """
    Merge data into the YAML document at fp (new values win over
    existing ones) and write the result back out.
    """
    merged_data = copy.deepcopy(data)
    if fp.exists():
        existing_data = yaml.safe_load(fp.open('rb'))
        existing_data.update(merged_data)
        merged_data = existing_data
    writeIfDifferent(fp, yaml.safe_dump(merged_data, default_flow_style=False))
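

# Cache fetched pages on disk so that repeated runs don't hit the network.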
class CachingDownloader(object):

    cache_dir = FilePath('.cache')

    def getURL(self, url):
        """
        Fetch a web page either from the local cache or else
        from the Internet.
        """
        if not self.cache_dir.exists():
            self.cache_dir.makedirs()
        # Cache entries are keyed by the SHA-1 hash of the URL.
        cache_key = hashlib.sha1(url).hexdigest()
        cache_file = self.cache_dir.child(cache_key)
        if not cache_file.exists():
            log('GET', url)
            r = requests.get(url)
            cache_file.setContent(r.content)
            cache_file.chmod(0664)
        else:
            log('[CACHE] GET', url)
        return cache_file.getContent()
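

# One shared downloader so every fetch in a run uses the same cache directory.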
downloader = CachingDownloader()
getURL = downloader.getURL


def conferenceURL(year, month, lang):
    """
    Build the URL of a conference's session index, e.g.
    https://www.lds.org/general-conference/sessions/2015/10?lang=eng
    """
    root = 'https://www.lds.org/general-conference/sessions'
    url = '{root}/{year}/{month:02d}?lang={lang}'.format(**locals())
    return url


def makeCounter():
    """Yield 0, 1, 2, ... (a simple stand-in for itertools.count())."""
    i = 0
    while True:
        yield i
        i += 1


def getTalkURLs(data_dir, year, month, lang):
    """
    Return a generator of talk metadata available from the index.
    """
    url = conferenceURL(year, month, lang)
    html = getURL(url)
    parsed = soupparser.fromstring(html)
    sessions = parsed.xpath('//table[@class="sessions"]')
    counter = makeCounter()
    for session_num, session in enumerate(sessions):
        session_id = session.attrib.get('id', None)
        if not session_id:
            continue
        session_title = session.xpath('.//tr[@class="head-row"]//h2')[0].text
        rows = session.xpath('tr')
        for row in rows:
            talk = row.xpath(".//span[@class='talk']")
            song = row.xpath(".//span[@class='song']")
            if not talk and not song:
                continue
            item_number = '{0:03d}'.format(counter.next())
            if song:
                # skipping for now
                continue
            elif talk:
                talk = talk[0]
                talk_a = talk.xpath('.//a')
                if not talk_a:
                    # probably no translation yet.
                    continue
                talk_a = talk_a[0]
                speaker = row.xpath(".//span[@class='speaker']")[0]
                url = talk_a.attrib['href']
                slug = urlparse(url).path.split('/')[-1]
                file_slug = '{item}-{slug}'.format(item=item_number,
                                                   slug=slug)
                yield {
                    'session_id': session_id,
                    'session_title': session_title,
                    'item': item_number,
                    'speaker': speaker.text,
                    'url': url,
                    'title': talk_a.text,
                    'slug': slug,
                    'key': file_slug,
                    'year': int(year),
                    'month': int(month),
                }


def getSingleConference(data_dir, year, month, lang):
    """
    Download and store data for a single general conference.

    Talks land in data_dir/<lang>/<YYYY-MM>/<item>-<slug>/ as text.md
    plus metadata.yml, along with a per-conference index.yml.
    """
    talk_urls = getTalkURLs(data_dir, year, month, lang)
    conf_path = data_dir.child(lang).child('{year}-{month:02d}'.format(**locals()))
    if not conf_path.exists():
        conf_path.makedirs()
    index = []
    for meta in talk_urls:
        index.append(meta)
        html = getURL(meta['url'])
        fp = conf_path.child(meta['key'])
        if not fp.exists():
            fp.makedirs()

        # text.md
        markdown = extractTalkAsMarkdown(html, meta)
        writeIfDifferent(fp.child('text.md'), markdown)

        # metadata.yml
        mergeYAML(fp.child('metadata.yml'), meta)

    index_file = conf_path.child('index.yml')
    index_data = {
        'year': year,
        'month': int(month),
        'items': index,
    }
    writeIfDifferent(index_file,
                     yaml.safe_dump(index_data, default_flow_style=False))


def extractTalkAsMarkdown(html, metadata):
    """
    Strip the talk page down to its article content and convert it
    to Markdown.
    """
    parsed = soupparser.fromstring(html)
    primary = parsed.get_element_by_id('primary')
    metadata['article_id'] = primary.get_element_by_id('article-id').text
    # Drop media players and prev/next navigation.
    todelete = [
        '//div[@id="video-player"]',
        '//div[@id="audio-player"]',
        '//span[@id="article-id"]',
        '//li[@class="prev"]',
        '//li[@class="next"]',
    ]
    for x in todelete:
        elems = primary.xpath(x)
        for elem in elems:
            elem.getparent().remove(elem)

    # Remove the intro blockquote (kicker).
    kicker = primary.xpath('.//blockquote[@class="intro dontHighlight"]')
    for k in kicker:
        k.getparent().remove(k)

    # remove no-link-style links
    for link in primary.xpath('.//a[@class="no-link-style"]'):
        link.drop_tag()

    # fix lists: drop the numbering labels and named anchors
    list_items = primary.xpath('.//ul[@class="bullet"]/li')
    list_items += primary.xpath('.//ol/li')
    for li in list_items:
        label = li.xpath('.//span[@class="label"]')[0]
        label.text = ''
        label.drop_tag()
        for anchor in li.xpath('.//a[@name]'):
            anchor.getparent().remove(anchor)

    # replace citations with bracketed markers like [1]
    for citation in primary.xpath('.//sup[@class="noteMarker"]/a'):
        citation.text = '[' + citation.text + ']'
        citation.drop_tag()

    import html2text
    h = html2text.HTML2Text()
    h.ignore_images = True
    h.reference_links = True
    markdown = h.handle(etree.tostring(primary))
    markdown = markdown.encode('utf-8')

    # replace some common fancy chars and other things
    replacements = {
        u'…'.encode('utf-8'): '...',
        '\xc2\xa0': ' ',  # UTF-8 non-breaking space
        '#### Show References': '## References',
    }
    for k, v in replacements.items():
        markdown = markdown.replace(k, v)

    # prepend the main title
    title = primary.xpath('//h1')[0].text.strip()
    markdown = '# ' + title.encode('utf-8') + '\n\n' + markdown
    return markdown


def listLanguages(year, month):
    """
    Yield (code, name) pairs for the languages this conference is
    available in, scraped from the page's language selector.
    """
    url = conferenceURL(year, month, 'eng')
    html = getURL(url)
    parsed = soupparser.fromstring(html)
    for option in parsed.xpath('//div[@id="clang-selector"]/select/option'):
        value = option.attrib['value']
        if not value:
            continue
        code = value[-3:]  # how stable do you think this magic number will be? :)
        yield code, option.text


if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument('--quiet', '-q',
                    action='store_true',
                    help='If supplied, logging information will be suppressed.')
    ap.add_argument('--cache-dir', '-c',
                    default='.cache',
                    type=FilePath,
                    help='Directory where downloaded files are cached '
                         '(to preserve bandwidth). Default: %(default)s')
    ap.add_argument('--data-dir', '-D',
                    default="data",
                    type=FilePath,
                    help='Root directory where the data is stored.'
                         ' Default: %(default)s')
    ap.add_argument('--lang', '-L',
                    default='eng',
                    help='Language to fetch. Currently, only English is tested,'
                         ' but other languages might work, too. Use the clang'
                         ' or lang value in the URL of lds.org for the language'
                         ' you want. Default: %(default)s')
    ap.add_argument('--list-langs',
                    action='store_true',
                    help='Instead of downloading anything, just list the'
                         ' available languages for this conference.')
    ap.add_argument('year', type=int,
                    help="Year of the conference (e.g. 2015)")
    ap.add_argument('month', type=int,
                    help="Month of the conference (e.g. 10 or 4)")
    args = ap.parse_args()

    if args.quiet:
        log = lambda *a: None  # NOQA
    downloader.cache_dir = args.cache_dir

    if args.list_langs:
        for code, name in listLanguages(args.year, args.month):
            print code, name
    else:
        if not args.data_dir.exists():
            args.data_dir.makedirs()
        getSingleConference(args.data_dir, args.year, args.month,
                            args.lang)
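
# Example usage, assuming requests, PyYAML, lxml, html2text, and the
# filepath package are installed under Python 2:
#
#   python download.py 2015 10               # fetch the October 2015 conference
#   python download.py --list-langs 2015 10  # list available language codes
#   python download.py -L spa 2015 10        # another language ('spa' is a guess)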