forked from tanimislam/nprstuff
/
waitwait.py
executable file
·295 lines (267 loc) · 14.7 KB
/
waitwait.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
#!/usr/bin/env python
import os, sys, glob, re, requests, multiprocessing
import subprocess, logging, datetime, time, titlecase
import npr_utils, mutagen.mp4, waitwait_realmedia
from optparse import OptionParser
from bs4 import BeautifulSoup
_npr_waitwait_progid = 35
def _get_last_saturday(datetime_s):
date_s = datetime_s.date( )
# first find today's date
tm_wday = date_s.weekday()
if tm_wday < 5:
tm_wday = tm_wday + 7
days_go_back = tm_wday - 5
date_sat = date_s - datetime.timedelta(days_go_back, 0, 0)
return date_sat
def get_waitwait_image( verify = True ):
    """Download the Wait Wait... Don't Tell Me! cover art PNG and return its raw bytes.

    :param verify: whether to verify the SSL certificate of the download.
    :returns: the raw PNG image data as a byte string.
    """
    image_url = 'https://upload.wikimedia.org/wikipedia/en/f/f4/WaitWait.png'
    response = requests.get( image_url, verify = verify )
    return response.content
def _download_file( input_tuple ):
    """Multiprocessing worker: download one MP3 URL into a local file.

    :param input_tuple: a ``(mp3URL, filename, verify)`` triple, packed into
      one argument so it can be used with :py:meth:`Pool.map`.
    """
    url, destination, verify = input_tuple
    payload = requests.get( url, verify = verify ).content
    with open( destination, 'wb' ) as handle:
        handle.write( payload )
def get_waitwait_date_from_name(candidateNPRWaitWaitFile):
    """Parse the broadcast date out of an ``NPR.WaitWait.DD.MM.YYYY.m4a`` filename.

    :param candidateNPRWaitWaitFile: path to an existing episode file whose
      basename is of the form ``NPR.WaitWait.<day>.<month>.<year>.m4a``.
    :returns: the episode's :py:class:`date <datetime.date>`.
    :raises ValueError: if the path is not an existing file, does not end in
      ``.m4a``, or does not start with ``NPR.WaitWait.``.
    """
    if not os.path.isfile(candidateNPRWaitWaitFile):
        # fixed punctuation: message previously ended with a stray comma
        raise ValueError("Error, %s is not a file." % candidateNPRWaitWaitFile )
    # hoist: the original computed os.path.basename three times
    basename = os.path.basename( candidateNPRWaitWaitFile )
    if not basename.endswith('.m4a'):
        raise ValueError("Error, %s does not end in .m4a" % candidateNPRWaitWaitFile )
    if not basename.startswith('NPR.WaitWait.'):
        raise ValueError("Error, %s is not a valid file" % candidateNPRWaitWaitFile )
    # tokens 2..4 of the dot-split basename are day, month, year
    day, mon, year = [ int(tok) for tok in basename.split('.')[2:5] ]
    return datetime.date(year, mon, day)
def get_waitwait_valid_dates_remaining_tuples(yearnum, inputdir):
    """Find the Saturdays of *yearnum* whose episodes are not yet downloaded.

    :param yearnum: the calendar year to inspect.
    :param inputdir: directory holding already-downloaded ``NPR.WaitWait.*.m4a`` files.
    :returns: a list of ``(order_in_year, total_in_year, date)`` tuples,
      sorted by order, restricted to Saturdays strictly before today.
    """
    # dates already present on disk for this year
    downloaded_dates = set( get_waitwait_date_from_name( fname ) for fname in
                            glob.glob( os.path.join(inputdir, 'NPR.WaitWait.*.%04d.m4a' % yearnum ) ) )
    # map each Saturday of the year to its 1-based ordinal
    saturday_order = dict( ( date_s, idx + 1 ) for ( idx, date_s ) in
                           enumerate( npr_utils.get_saturday_times_in_year( yearnum ) ) )
    today = datetime.datetime.now( ).date( )
    # only past Saturdays can have an episode to fetch
    candidates = [ date_s for date_s in set( saturday_order ) - downloaded_dates
                   if date_s < today ]
    total = len( saturday_order )
    return sorted( [ ( saturday_order[ date_s ], total, date_s ) for date_s in candidates ],
                   key = lambda tup: tup[0] )
def _process_waitwaits_by_year_tuple(input_tuple):
    """Multiprocessing worker: build each Wait Wait episode assigned to this process.

    :param input_tuple: ``(outputdir, totnum, verbose, datetimes_order_tuples)``
      where *datetimes_order_tuples* is a list of ``(date, order)`` pairs.
    """
    outputdir, totnum, verbose, datetimes_order_tuples = input_tuple
    # fetch the cover art once and share it across all episodes in this batch
    ww_image = get_waitwait_image()
    for date_s, order in datetimes_order_tuples:
        tic = time.time()
        try:
            fname = get_waitwait( outputdir, date_s, order_totnum = ( order, totnum ),
                                  file_data = ww_image )
            if verbose:
                print('Processed %s in %0.3f seconds.' % ( fname, time.time() - tic ))
        except Exception as e:
            # best effort: a failed episode must not kill the whole batch
            print('Could not create Wait Wait episode for date %s for some reason.' % (
                npr_utils.get_datestring( date_s ) ) )
def get_all_waitwaits_year( yearnum,
                            inputdir, verbose = True):
    """Download every remaining Wait Wait episode of *yearnum* into *inputdir*.

    Episodes are distributed round-robin over one worker process per CPU.

    :param yearnum: the calendar year to process.
    :param inputdir: destination directory for the episode files.
    :param verbose: if True, print a summary timing line when finished.
    """
    remaining = get_waitwait_valid_dates_remaining_tuples( yearnum, inputdir )
    if len( remaining ) == 0:
        return
    totnum = remaining[0][1]
    nprocs = multiprocessing.cpu_count()
    # deal the episodes out to the workers round-robin by their order number
    input_tuples = []
    for procno in range( nprocs ):
        assigned = [ ( date_s, order ) for ( order, _, date_s ) in remaining
                     if ( order - 1 ) % nprocs == procno ]
        input_tuples.append( ( inputdir, totnum, verbose, assigned ) )
    tic = time.time()
    pool = npr_utils.MyPool(processes = nprocs )
    pool.map(_process_waitwaits_by_year_tuple, input_tuples)
    if verbose:
        print('processed all Wait Wait downloads for %04d in %0.3f seconds.' % ( yearnum, time.time() - tic ) )
def get_title_wavfile_standard(date_s, outputdir, avconv_exec,
                               debugonly = False, npr_api_key = None,
                               verify = True, justFix = False ):
    """
    Query the NPR API for the Wait Wait episode broadcast on *date_s*,
    download its per-segment MP3 files into *outputdir*, and build the
    episode title string out of the segment titles.

    :param date_s: the :py:class:`date <datetime.date>` of the episode.
    :param outputdir: directory into which the segment MP3s are written.
    :param avconv_exec: path to the avconv executable.  NOTE(review): only
      referenced by the commented-out conversion code at the bottom; it is
      unused in the live code of this function.
    :param debugonly: if True, dump the prettified API response into
      *outputdir* and return ``None`` without downloading anything.
    :param npr_api_key: NPR API key; looked up via npr_utils when ``None``.
    :param verify: whether to verify SSL certificates on the API request.
    :param justFix: if True, skip the MP3 downloads; only the title and the
      expected output file names are computed.
    :returns: a ``(title, outfiles)`` tuple, or ``None`` when *debugonly*.
    :raises ValueError: if the NPR API responds with a non-200 status code.

    NOTE(review): Python 2 only -- relies on tuple-unpacking lambdas and on
    ``filter``/``map`` returning lists.
    """
    if npr_api_key is None:
        npr_api_key = npr_utils.get_api_key()
    # decdate is the date token embedded in output file names
    # (from npr_utils.get_decdate)
    decdate = npr_utils.get_decdate( date_s )
    # ask the NPR query API for NPRML story data of this program on this date
    response = requests.get( 'https://api.npr.org/query', verify = verify,
                             params = {
                                 'date' : date_s.strftime('%Y-%m-%d'),
                                 'output' : 'NPRML',
                                 'apiKey' : npr_api_key,
                                 'dataType' : 'story',
                                 'id' : _npr_waitwait_progid })
    if response.status_code != 200:
        raise ValueError("Error, could not get wait wait episode on %s. Error code is %d." %
                         ( date_s.strftime('%B %d, %Y'), response.status_code ) )
    # parse the NPRML response with BeautifulSoup (lxml backend)
    html = BeautifulSoup( response.content, 'lxml' )
    if debugonly:
        # debug mode: dump the prettified response for inspection and bail out
        openfile = os.path.join( outputdir, 'NPR.WaitWait.%s.html' %
                                 decdate )
        with open( openfile, 'w') as outfile:
            outfile.write( '%s\n' % html.prettify( ) )
        return None

    def _get_title( title_URL ):
        # fetch the story page and titlecase the part of its <title> before
        # the first colon; returns None when the page cannot be fetched
        r2 = requests.get( title_URL )
        if r2.status_code != 200:
            return None
        h2 = BeautifulSoup( r2.content, 'lxml' )
        title = titlecase.titlecase( max( h2.find_all('title') ).text.split(':')[0].strip( ) )
        return title

    # now get tuple of title to mp3 file
    title_mp3_urls = []
    for elem in filter(lambda elem: len( elem.find_all('mp3')) == 1, html.find_all('story')):
        # first non-blank http: line in the story text is the story page URL
        all_texts = filter(lambda line: len(line.strip()) != 0 and line.strip().startswith('http:'),
                           elem.text.split('\n'))
        title_URL = all_texts[0].strip( )
        title = _get_title( title_URL )
        if title is None:
            continue
        # the <mp3 type="m3u"> element holds a playlist URL whose body is
        # the actual mp3 URL
        m3uurl = max( filter(lambda elm: 'type' in elm.attrs and
                             elm['type'] == 'm3u', elem.find_all('mp3') ) ).get_text( ).strip( )
        try:
            mp3url = requests.get( m3uurl ).content.strip( )
            # the trailing _NN token of the mp3 file name is the segment order
            order = int( mp3url.split('_')[-1].replace('.mp3', '') )
            title_mp3_urls.append( ( title, mp3url, order ) )
        except Exception:
            # best effort: skip segments whose mp3 URL cannot be resolved
            pass
    # sort segments by broadcast order before unzipping into parallel lists
    titles, mp3urls, orders = zip(*sorted(title_mp3_urls,
                                          key = lambda (title, mp3url, order): order))
    titles = list( titles )
    title = date_s.strftime('%B %d, %Y')
    # the <parent type="programEpisode"> element's text carries the guest name
    title_elem_nmj = max(filter(lambda elem: len( elem.find_all('title')) == 1 and
                                'type' in elem.attrs and elem.attrs['type'] == 'programEpisode',
                                html.find_all('parent')))
    title_text = filter(lambda line: len(line.strip()) != 0, title_elem_nmj.text.split('\n'))[0]
    guest = re.sub('.*Guest', '', title_text ).strip( )
    # rename the 'Not My Job' segment, if present, to include the guest's name
    title_guest_elems = filter(lambda (idx, titl): titl == 'Not My Job', enumerate(titles))
    if len( title_guest_elems ) != 0:
        idx_title_guest = max(title_guest_elems)[0]
        titles[ idx_title_guest ] = 'Not My Job: %s' % guest
    # full episode title: "<Month DD, YYYY>: 1) seg; 2) seg; ...."
    title = '%s: %s.' % ( title,
                          '; '.join(map(lambda (num, titl): '%d) %s' % ( num + 1, titl ),
                                        enumerate(titles))))
    # one numbered output mp3 per segment: waitwait.<decdate>.<n>.mp3
    outfiles = map(lambda (num, mp3url): os.path.join(outputdir, 'waitwait.%s.%d.mp3' %
                                                      ( decdate, num + 1) ),
                   enumerate( mp3urls ) )
    if not justFix:
        # download those files
        time0 = time.time()
        pool = multiprocessing.Pool(processes = len(mp3urls) )
        pool.map(_download_file, zip( mp3urls, outfiles, len( mp3urls ) * [ verify ] ) )
        logging.debug( 'downloaded %d mp3 files in %0.3f seconds.' % ( len( mp3urls ),
                                                                       time.time( ) - time0 ) )
    # sox magic command
    # time0 = time.time()
    #wgdate = date_s.strftime('%d-%b-%Y')
    #wavfile = os.path.join(outputdir, 'waitwait%s.wav' % wgdate ).replace(' ', '\ ')
    #fnames = [ filename.replace(' ', '\ ') for filename in outfiles ]
    #split_cmd = [ '(for', 'file', 'in', ] + fnames + [
    #    ';', sox_exec, '$file', '-t', 'cdr', '-', ';', 'done)' ] + [
    #    '|', sox_exec, 't-', 'cdr', '-', wavfile ]
    # split_cmd = [ sox_exec, ] + fnames + [ wavfile, ]
    #sox_string_cmd = 'concat:%s' % '|'.join( fnames )
    #split_cmd = [ avconv_exec, '-y', '-i', sox_string_cmd, '-ar', '44100', '-ac', '2', '-threads',
    #              '%d' % multiprocessing.cpu_count(), wavfile ]
    #proc = subprocess.Popen(split_cmd, stdout = subprocess.PIPE,
    #                        stderr = subprocess.PIPE)
    #stdout_val, stderr_val = proc.communicate()
    #for filename in outfiles:
    #    os.remove(filename)
    return title, outfiles
def get_waitwait(outputdir, date_s, order_totnum = None,
                 file_data = None, debugonly = False,
                 exec_dict = None, verify = True, justFix = False ):
    """
    Download the Wait Wait episode broadcast on Saturday *date_s* and
    assemble it into ``NPR.WaitWait.<decdate>.m4a`` inside *outputdir*,
    tagged with title, album, artist, year, track number and cover art.
    Episodes from 2006 onward come from the NPR API as MP3 segments that
    are concatenated with avconv; older episodes go through the
    waitwait_realmedia RealMedia path.

    :param outputdir: directory into which the episode file is written.
    :param date_s: the episode date; must be a Saturday.
    :param order_totnum: optional ``(order_in_year, total_in_year)`` tuple;
      computed via npr_utils when None.
    :param file_data: raw PNG bytes used as cover art; downloaded via
      :py:func:`get_waitwait_image` when None.
    :param debugonly: forwarded to get_title_wavfile_standard; when that
      call returns None this function also returns None.
    :param exec_dict: mapping of executable names to paths; must contain
      'avconv'.  Discovered via npr_utils when None.
    :param verify: whether to verify SSL certificates.
    :param justFix: if True (2006+ episodes only), only rewrite the title
      tag of an already-existing M4A file.
    :returns: the path of the M4A file, or None on early exit.
    :raises ValueError: if *outputdir* is not a directory, *date_s* is not
      a Saturday, or avconv lacks the concat protocol.
    """
    # check if outputdir is a directory
    if not os.path.isdir(outputdir):
        raise ValueError("Error, %s is not a directory." % outputdir)
    # check if actually saturday
    if not npr_utils.is_saturday(date_s):
        raise ValueError("Error, date = %s not a Saturday." %
                         npr_utils.get_datestring(date_s) )
    if exec_dict is None:
        exec_dict = npr_utils.find_necessary_executables()
    # NOTE(review): stripped under python -O; a raise would be sturdier
    assert( exec_dict is not None )
    avconv_exec = exec_dict['avconv']
    if order_totnum is None:
        order_totnum = npr_utils.get_order_number_saturday_in_year(date_s)
    order_in_year, tot_in_year = order_totnum
    if file_data is None:
        file_data = get_waitwait_image( verify = verify )
    year = date_s.year
    decdate = npr_utils.get_decdate( date_s )
    m4afile = os.path.join(outputdir, 'NPR.WaitWait.%s.m4a' % decdate )
    if year >= 2006:
        # modern path: fetch segment mp3s and episode title from the NPR API
        tup = get_title_wavfile_standard(date_s, outputdir, avconv_exec,
                                         debugonly = debugonly,
                                         verify = verify, justFix = justFix )
        if tup is None:
            # debugonly dump happened inside get_title_wavfile_standard
            return
        title, outfiles = tup
        if justFix: # works only for year >= 2006
            # only rewrite the title tag of the existing episode file
            if not os.path.isfile( m4afile ):
                print "Error, %s does not exist." % os.path.basename( m4afile )
                return
            mp4tags = mutagen.mp4.MP4(m4afile)
            mp4tags.tags['\xa9nam'] = [ title, ]
            mp4tags.save( )
            logging.debug('fixed title for %s.' % m4afile )
            return m4afile
        # concatenate the segment mp3s into one m4a via avconv's concat protocol
        fnames = map(lambda filename: filename.replace(' ', '\ '), outfiles)
        sox_string_cmd = 'concat:%s' % '|'.join( fnames )
        split_cmd = [ avconv_exec, '-y', '-i', sox_string_cmd, '-ar', '44100', '-ac', '2', '-threads',
                      '%d' % multiprocessing.cpu_count(), '-strict', 'experimental', '-acodec', 'aac',
                      m4afile ]
        proc = subprocess.Popen(split_cmd, stdout = subprocess.PIPE,
                                stderr = subprocess.PIPE)
        stdout_val, stderr_val = proc.communicate()
        if 'Protocol not found' in stderr_val.strip( ):
            # avconv was built without the concat protocol; clean up and fail
            for filename in outfiles:
                os.remove( filename )
            raise ValueError("Error, AVCONV does not have the concatenation protocol.")
        # remove the per-segment mp3s now that the m4a exists
        for filename in outfiles:
            os.remove( filename )
    else:
        # legacy path (< 2006): RealMedia download, decode to wav, then m4a
        title = waitwait_realmedia.rm_get_title_from_url( date_s )
        rmfile = waitwait_realmedia.rm_download_file( date_s,
                                                      outdir = outputdir )
        wavfile = waitwait_realmedia.rm_create_wav_file( date_s, rmfile,
                                                         outdir = outputdir )
        os.remove( rmfile )
        # now convert to m4a file
        m4afile = os.path.join(outputdir, 'NPR.WaitWait.%s.m4a' % decdate )
        split_cmd = [ avconv_exec, '-y', '-i', wavfile, '-ar', '44100',
                      '-ac', '2', '-threads', '%d' % multiprocessing.cpu_count(),
                      '-strict', 'experimental', '-acodec', 'aac', m4afile ]
        proc = subprocess.Popen(split_cmd, stdout = subprocess.PIPE,
                                stderr = subprocess.PIPE)
        stdout_val, stderr_val = proc.communicate()
        # remove wav file
        os.remove( wavfile )
    # now put in metadata
    mp4tags = mutagen.mp4.MP4(m4afile)
    mp4tags.tags['\xa9nam'] = [ title, ]
    mp4tags.tags['\xa9alb'] = [ "Wait Wait...Don't Tell Me: %d" % year, ]
    mp4tags.tags['\xa9ART'] = [ 'Peter Sagal', ]
    mp4tags.tags['\xa9day'] = [ '%d' % year, ]
    mp4tags.tags['\xa9cmt'] = [ "more info at : NPR Web site", ]
    mp4tags.tags['trkn'] = [ ( order_in_year, tot_in_year ), ]
    mp4tags.tags['covr'] = [ mutagen.mp4.MP4Cover(file_data, mutagen.mp4.MP4Cover.FORMAT_PNG ), ]
    mp4tags.tags['\xa9gen'] = [ 'Podcast', ]
    mp4tags.save()
    return m4afile
if __name__=='__main__':
    # command-line front end: download (or fix the title of) one episode.
    default_dirname = '/mnt/media/waitwait'
    # default to the most recent Saturday's episode
    last_saturday_string = npr_utils.get_datestring( _get_last_saturday( datetime.datetime.now() ) )
    parser = OptionParser()
    parser.add_option('--dirname', dest='dirname', type=str,
                      action = 'store', default = default_dirname,
                      help = 'Name of the directory to store the file. Default is %s.' %
                      default_dirname)
    parser.add_option('--date', dest='date', type=str,
                      action = 'store', default = last_saturday_string,
                      help = 'The date, in the form of "January 1, 2014." The default is last Saturday, %s.' %
                      last_saturday_string )
    parser.add_option('--debugonly', dest='debugonly', action='store_true', default = False,
                      help = 'If chosen, download the NPR XML data sheet for this Wait Wait episode.')
    parser.add_option('--noverify', dest='do_noverify', action='store_true', default = False,
                      help = 'If chosen, Do not verify the SSL connection.')
    parser.add_option('--justfix', dest='do_justfix', action='store_true', default = False,
                      help = "If chosen, just fix the title of an existing NPR Wait Wait episode's file.")
    opts, args = parser.parse_args()
    if opts.debugonly:
        logging.basicConfig( level = logging.DEBUG )
    fname = get_waitwait( opts.dirname, npr_utils.get_time_from_datestring( opts.date ),
                          debugonly = opts.debugonly,
                          verify = not opts.do_noverify, justFix = opts.do_justfix )