def get_title_wavfile_standard(date_s, outputdir, avconv_exec, debugonly=False, npr_api_key=None):
    """
    Fetch the NPR API metadata for the Wait Wait... episode on ``date_s`` and
    download each show segment as an mp3 into ``outputdir``.

    :param date_s: date of the episode (a :py:class:`datetime.date`).
    :param outputdir: directory into which the per-segment mp3 files are saved.
    :param avconv_exec: path to the avconv executable (kept for interface
        compatibility; not used inside this function).
    :param debugonly: if ``True``, only dump the raw API XML into
        ``outputdir`` and return ``None``.
    :param npr_api_key: NPR API key; looked up via ``npr_utils`` when ``None``.
    :returns: a tuple ``(title, outfiles)`` — the full episode title string and
        the list of downloaded mp3 paths — or ``None`` when ``debugonly``.
    """
    if npr_api_key is None:
        npr_api_key = npr_utils.get_api_key()
    # download the episode metadata into an lxml element tree
    nprURL = npr_utils.get_NPR_URL(date_s, _npr_waitwait_progid, npr_api_key)
    decdate = npr_utils.get_decdate(date_s)
    tree = lxml.etree.fromstring(requests.get(nprURL).content)
    if debugonly:
        # dump the raw XML for inspection; lxml.etree.tostring() returns
        # bytes, so the file must be opened in binary mode
        openfile = os.path.join(outputdir, "NPR.WaitWait.tree.%s.xml" % decdate)
        with open(openfile, "wb") as outfile:
            outfile.write(lxml.etree.tostring(tree))
        return None
    # collect (title, mp3 url, order-within-episode) for each story that
    # carries an mp3 element
    title_mp3_urls = []
    for elem in filter(lambda el: len(list(el.iter("mp3"))) != 0, tree.iter("story")):
        title = list(elem.iter("title"))[0].text.strip()
        m3uurl = max(
            filter(lambda elm: "type" in elm.keys() and elm.get("type") == "m3u",
                   elem.iter("mp3"))).text.strip()
        try:
            mp3url = requests.get(m3uurl).content.strip()
            # the segment's position in the show is encoded at the end of the
            # mp3 filename, e.g. ..._05.mp3
            order = int(mp3url.split("_")[-1].replace(".mp3", ""))
            title_mp3_urls.append((title, mp3url, order))
        except Exception:
            # best effort: skip stories whose m3u playlist cannot be resolved
            pass
    titles, mp3urls, orders = zip(*sorted(title_mp3_urls, key=lambda tup: tup[2]))
    # episode title: "<Month DD, YYYY>: 1) seg; 2) seg; ..."
    title = date_s.strftime("%B %d, %Y")
    title = "%s: %s." % (
        title,
        "; ".join("%d) %s" % (num + 1, titl) for (num, titl) in enumerate(titles)))
    outfiles = [
        os.path.join(outputdir, "waitwait.%s.%d.mp3" % (decdate, num + 1))
        for num in range(len(mp3urls))]
    # download the segments in parallel; close/join the pool so worker
    # processes are not leaked
    pool = multiprocessing.Pool(processes=len(mp3urls))
    try:
        pool.map(_download_file, zip(mp3urls, outfiles))
    finally:
        pool.close()
        pool.join()
    return title, outfiles
def rm_download_file(date_s, outdir=None):
    """
    Download the RealMedia (.rm) recording of the Wait Wait... episode on
    ``date_s`` from NPR's download server.

    :param date_s: date of the episode (a :py:class:`datetime.date`).
    :param outdir: directory into which the ``.rm`` file is saved; defaults to
        the current working directory at call time.
    :returns: the full path of the downloaded ``.rm`` file.
    :raises ValueError: if the download fails for any reason; any partially
        written file is removed first.
    """
    if outdir is None:
        # resolve at call time, not import time (a module-level default of
        # os.getcwd() would freeze the directory at import)
        outdir = os.getcwd()
    decdate = npr_utils.get_decdate(date_s)
    outfile = os.path.join(outdir, 'NPR.WaitWait.%s.rm' % decdate)
    try:
        dsub = date_s.strftime('%Y%m%d')
        rm_url = 'http://download.npr.org/real.npr.na-central/waitwait/%s_waitwait.rm' % dsub
        req = urllib2.urlopen(rm_url)
        # RealMedia is binary data; write in binary mode
        with open(outfile, 'wb') as openfile:
            openfile.write(req.read())
        return outfile
    except Exception:
        # clean up any partial download before reporting the failure
        if os.path.isfile(outfile):
            os.remove(outfile)
        raise ValueError(
            "Error, could not download Wait Wait RM file for '%s' into %s." % (
                npr_utils.get_datestring(date_s), outdir))
def rm_download_file(date_s, outdir=None):
    """
    Download the RealMedia (.rm) recording of the Wait Wait... episode on
    ``date_s`` from NPR's download server.

    :param date_s: date of the episode (a :py:class:`datetime.date`).
    :param outdir: directory into which the ``.rm`` file is saved; defaults to
        the current working directory at call time.
    :returns: the full path of the downloaded ``.rm`` file.
    :raises ValueError: if the download fails for any reason; any partially
        written file is removed first.
    """
    if outdir is None:
        # resolve at call time, not import time (a module-level default of
        # os.getcwd() would freeze the directory at import)
        outdir = os.getcwd()
    decdate = npr_utils.get_decdate(date_s)
    outfile = os.path.join(outdir, 'NPR.WaitWait.%s.rm' % decdate)
    try:
        dsub = date_s.strftime('%Y%m%d')
        rm_url = 'http://download.npr.org/real.npr.na-central/waitwait/%s_waitwait.rm' % dsub
        req = urllib2.urlopen(rm_url)
        # RealMedia is binary data; write in binary mode
        with open(outfile, 'wb') as openfile:
            openfile.write(req.read())
        return outfile
    except Exception:
        # clean up any partial download before reporting the failure
        if os.path.isfile(outfile):
            os.remove(outfile)
        raise ValueError(
            "Error, could not download Wait Wait RM file for '%s' into %s." % (
                npr_utils.get_datestring(date_s), outdir))
def get_waitwait(outputdir, date_s, order_totnum=None, file_data=None, debugonly=False, exec_dict=None):
    """
    Download the Wait Wait... episode on ``date_s``, convert it into a single
    m4a file inside ``outputdir``, and tag it with title/album/artist/cover
    metadata.

    :param outputdir: directory into which the m4a file is created.
    :param date_s: the Saturday on which the episode aired.
    :param order_totnum: ``(order_in_year, total_in_year)`` tuple; computed
        from ``date_s`` when ``None``.
    :param file_data: PNG cover-art bytes; fetched via ``get_waitwait_image``
        when ``None``.
    :param debugonly: passed through to ``get_title_wavfile_standard``; when
        that call returns ``None`` this function returns ``None`` as well.
    :param exec_dict: mapping of executable names to paths; discovered via
        ``npr_utils`` when ``None``. Must contain ``'avconv'``.
    :returns: the full path of the tagged m4a file, or ``None`` in debug mode.
    :raises ValueError: if ``outputdir`` is not a directory or ``date_s`` is
        not a Saturday.
    """
    # guard clauses: valid output directory and an actual Saturday
    if not os.path.isdir(outputdir):
        raise ValueError("Error, %s is not a directory." % outputdir)
    if not npr_utils.is_saturday(date_s):
        raise ValueError("Error, date = %s not a Saturday." % npr_utils.get_datestring(date_s))
    if exec_dict is None:
        exec_dict = npr_utils.find_necessary_executables()
    assert exec_dict is not None
    avconv_exec = exec_dict["avconv"]
    if order_totnum is None:
        order_totnum = npr_utils.get_order_number_saturday_in_year(date_s)
    order_in_year, tot_in_year = order_totnum
    if file_data is None:
        file_data = get_waitwait_image()
    year = date_s.year
    decdate = npr_utils.get_decdate(date_s)
    m4afile = os.path.join(outputdir, "NPR.WaitWait.%s.m4a" % decdate)
    if year >= 2006:
        # modern episodes: fetch segment mp3s via the NPR API, then stitch
        # them together with avconv's "concat:" protocol
        tup = get_title_wavfile_standard(date_s, outputdir, avconv_exec, debugonly=debugonly)
        if tup is None:
            return
        title, outfiles = tup
        escaped = [fname.replace(" ", "\\ ") for fname in outfiles]
        concat_input = "concat:%s" % "|".join(escaped)
        convert_cmd = [
            avconv_exec, "-y", "-i", concat_input, "-ar", "44100", "-ac", "2",
            "-threads", "%d" % multiprocessing.cpu_count(),
            "-strict", "experimental", "-acodec", "aac", m4afile]
        proc = subprocess.Popen(convert_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout_val, stderr_val = proc.communicate()
        for fname in outfiles:
            os.remove(fname)
    else:
        # pre-2006 episodes exist only as RealMedia; go rm -> wav -> m4a
        title = waitwait_realmedia.rm_get_title_from_url(date_s)
        rmfile = waitwait_realmedia.rm_download_file(date_s, outdir=outputdir)
        wavfile = waitwait_realmedia.rm_create_wav_file(date_s, rmfile, outdir=outputdir)
        os.remove(rmfile)
        m4afile = os.path.join(outputdir, "NPR.WaitWait.%s.m4a" % decdate)
        convert_cmd = [
            avconv_exec, "-y", "-i", wavfile, "-ar", "44100", "-ac", "2",
            "-threads", "%d" % multiprocessing.cpu_count(),
            "-strict", "experimental", "-acodec", "aac", m4afile]
        proc = subprocess.Popen(convert_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout_val, stderr_val = proc.communicate()
        os.remove(wavfile)
    # write MP4 metadata: title, album, artist, year, comment, track
    # number, cover art, and genre
    mp4tags = mutagen.mp4.MP4(m4afile)
    mp4tags.tags["\xa9nam"] = [title]
    mp4tags.tags["\xa9alb"] = ["Wait Wait...Don't Tell Me: %d" % year]
    mp4tags.tags["\xa9ART"] = ["Peter Sagal"]
    mp4tags.tags["\xa9day"] = ["%d" % year]
    mp4tags.tags["\xa9cmt"] = ["more info at : NPR Web site"]
    mp4tags.tags["trkn"] = [(order_in_year, tot_in_year)]
    mp4tags.tags["covr"] = [mutagen.mp4.MP4Cover(file_data, mutagen.mp4.MP4Cover.FORMAT_PNG)]
    mp4tags.tags["\xa9gen"] = ["Podcast"]
    mp4tags.save()
    return m4afile
def get_waitwait(outputdir, date_s, order_totnum=None, file_data=None, debugonly=False,
                 exec_dict=None, verify=True, justFix=False):
    """
    Download the Wait Wait... episode on ``date_s``, convert it into a single
    m4a file inside ``outputdir``, and tag it with title/album/artist/cover
    metadata.

    :param outputdir: directory into which the m4a file is created.
    :param date_s: the Saturday on which the episode aired.
    :param order_totnum: ``(order_in_year, total_in_year)`` tuple; computed
        from ``date_s`` when ``None``.
    :param file_data: PNG cover-art bytes; fetched via ``get_waitwait_image``
        when ``None``.
    :param debugonly: passed through to ``get_title_wavfile_standard``; when
        that call returns ``None`` this function returns ``None`` as well.
    :param exec_dict: mapping of executable names to paths; discovered via
        ``npr_utils`` when ``None``. Must contain ``'avconv'``.
    :param verify: whether to verify SSL certificates on HTTP requests.
    :param justFix: only rewrite the title tag of an already-downloaded m4a
        file (works only for year >= 2006 episodes).
    :returns: the full path of the tagged m4a file, or ``None`` in debug mode
        or when ``justFix`` finds no existing file.
    :raises ValueError: if ``outputdir`` is not a directory, ``date_s`` is not
        a Saturday, or avconv lacks the "concat" protocol.
    """
    # guard clauses: valid output directory and an actual Saturday
    if not os.path.isdir(outputdir):
        raise ValueError("Error, %s is not a directory." % outputdir)
    if not npr_utils.is_saturday(date_s):
        raise ValueError("Error, date = %s not a Saturday." % npr_utils.get_datestring(date_s))
    if exec_dict is None:
        exec_dict = npr_utils.find_necessary_executables()
    assert (exec_dict is not None)
    avconv_exec = exec_dict['avconv']
    if order_totnum is None:
        order_totnum = npr_utils.get_order_number_saturday_in_year(date_s)
    order_in_year, tot_in_year = order_totnum
    if file_data is None:
        file_data = get_waitwait_image(verify=verify)
    year = date_s.year
    decdate = npr_utils.get_decdate(date_s)
    m4afile = os.path.join(outputdir, 'NPR.WaitWait.%s.m4a' % decdate)
    if year >= 2006:
        # modern episodes: fetch segment mp3s via the NPR API, then stitch
        # them together with avconv's "concat:" protocol
        tup = get_title_wavfile_standard(
            date_s, outputdir, avconv_exec,
            debugonly=debugonly, verify=verify, justFix=justFix)
        if tup is None:
            return
        title, outfiles = tup
        if justFix:  # works only for year >= 2006
            if not os.path.isfile(m4afile):
                # print() call form works on both Python 2 and 3
                print("Error, %s does not exist." % os.path.basename(m4afile))
                return
            mp4tags = mutagen.mp4.MP4(m4afile)
            mp4tags.tags['\xa9nam'] = [title, ]
            mp4tags.save()
            logging.debug('fixed title for %s.', m4afile)
            return m4afile
        # list comprehension (map() would be a lazy iterator on Python 3)
        fnames = [filename.replace(' ', '\\ ') for filename in outfiles]
        sox_string_cmd = 'concat:%s' % '|'.join(fnames)
        split_cmd = [
            avconv_exec, '-y', '-i', sox_string_cmd, '-ar', '44100', '-ac', '2',
            '-threads', '%d' % multiprocessing.cpu_count(),
            '-strict', 'experimental', '-acodec', 'aac', m4afile]
        proc = subprocess.Popen(split_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout_val, stderr_val = proc.communicate()
        if 'Protocol not found' in stderr_val:
            # avconv build lacks the concat protocol; clean up and bail out
            for filename in outfiles:
                os.remove(filename)
            raise ValueError(
                "Error, AVCONV does not have the concatenation protocol.")
        for filename in outfiles:
            os.remove(filename)
    else:
        # pre-2006 episodes exist only as RealMedia; go rm -> wav -> m4a
        title = waitwait_realmedia.rm_get_title_from_url(date_s)
        rmfile = waitwait_realmedia.rm_download_file(date_s, outdir=outputdir)
        wavfile = waitwait_realmedia.rm_create_wav_file(date_s, rmfile, outdir=outputdir)
        os.remove(rmfile)
        # now convert to m4a file
        m4afile = os.path.join(outputdir, 'NPR.WaitWait.%s.m4a' % decdate)
        split_cmd = [
            avconv_exec, '-y', '-i', wavfile, '-ar', '44100', '-ac', '2',
            '-threads', '%d' % multiprocessing.cpu_count(),
            '-strict', 'experimental', '-acodec', 'aac', m4afile]
        proc = subprocess.Popen(split_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout_val, stderr_val = proc.communicate()
        # remove wav file
        os.remove(wavfile)
    # write MP4 metadata: title, album, artist, year, comment, track
    # number, cover art, and genre
    mp4tags = mutagen.mp4.MP4(m4afile)
    mp4tags.tags['\xa9nam'] = [title, ]
    mp4tags.tags['\xa9alb'] = ["Wait Wait...Don't Tell Me: %d" % year, ]
    mp4tags.tags['\xa9ART'] = ['Peter Sagal', ]
    mp4tags.tags['\xa9day'] = ['%d' % year, ]
    mp4tags.tags['\xa9cmt'] = ["more info at : NPR Web site", ]
    mp4tags.tags['trkn'] = [(order_in_year, tot_in_year), ]
    mp4tags.tags['covr'] = [
        mutagen.mp4.MP4Cover(file_data, mutagen.mp4.MP4Cover.FORMAT_PNG), ]
    mp4tags.tags['\xa9gen'] = ['Podcast', ]
    mp4tags.save()
    return m4afile
def get_title_wavfile_standard(date_s, outputdir, avconv_exec, debugonly=False,
                               npr_api_key=None, verify=True, justFix=False):
    """
    Query the NPR API for the Wait Wait... episode on ``date_s``, build the
    full episode title (including the "Not My Job" guest name), and download
    each show segment as an mp3 into ``outputdir``.

    :param date_s: date of the episode (a :py:class:`datetime.date`).
    :param outputdir: directory into which the per-segment mp3 files are saved.
    :param avconv_exec: path to the avconv executable (kept for interface
        compatibility; not used inside this function).
    :param debugonly: if ``True``, only dump the prettified API response into
        ``outputdir`` and return ``None``.
    :param npr_api_key: NPR API key; looked up via ``npr_utils`` when ``None``.
    :param verify: whether to verify SSL certificates on HTTP requests.
    :param justFix: if ``True``, skip the mp3 downloads and only compute the
        title and the would-be output paths.
    :returns: a tuple ``(title, outfiles)`` — the full episode title string and
        the list of mp3 paths — or ``None`` when ``debugonly``.
    :raises ValueError: if the NPR API request does not return HTTP 200.
    """
    if npr_api_key is None:
        npr_api_key = npr_utils.get_api_key()
    decdate = npr_utils.get_decdate(date_s)
    response = requests.get(
        'https://api.npr.org/query', verify=verify,
        params={
            'date': date_s.strftime('%Y-%m-%d'),
            'output': 'NPRML',
            'apiKey': npr_api_key,
            'dataType': 'story',
            'id': _npr_waitwait_progid})
    if response.status_code != 200:
        raise ValueError(
            "Error, could not get wait wait episode on %s. Error code is %d." % (
                date_s.strftime('%B %d, %Y'), response.status_code))
    html = BeautifulSoup(response.content, 'lxml')
    if debugonly:
        # dump the prettified response for inspection, then stop
        openfile = os.path.join(outputdir, 'NPR.WaitWait.%s.html' % decdate)
        with open(openfile, 'w') as outfile:
            outfile.write('%s\n' % html.prettify())
        return None

    def _get_title(title_URL):
        # fetch the story page and take the text before ':' in its <title>
        r2 = requests.get(title_URL)
        if r2.status_code != 200:
            return None
        h2 = BeautifulSoup(r2.content, 'lxml')
        return titlecase.titlecase(
            max(h2.find_all('title')).text.split(':')[0].strip())

    # collect (title, mp3 url, order-within-episode) for each story that
    # carries exactly one mp3 element
    title_mp3_urls = []
    for elem in filter(lambda el: len(el.find_all('mp3')) == 1, html.find_all('story')):
        all_texts = [
            line for line in elem.text.split('\n')
            if len(line.strip()) != 0 and line.strip().startswith('http:')]
        title_URL = all_texts[0].strip()
        title = _get_title(title_URL)
        if title is None:
            continue
        m3uurl = max(
            filter(lambda elm: 'type' in elm.attrs and elm['type'] == 'm3u',
                   elem.find_all('mp3'))).get_text().strip()
        try:
            mp3url = requests.get(m3uurl).content.strip()
            # the segment's position in the show is encoded at the end of
            # the mp3 filename, e.g. ..._05.mp3
            order = int(mp3url.split('_')[-1].replace('.mp3', ''))
            title_mp3_urls.append((title, mp3url, order))
        except Exception:
            # best effort: skip stories whose m3u playlist cannot be resolved
            pass
    # plain subscript key (py2-only "lambda (a, b, c): c" removed)
    titles, mp3urls, orders = zip(*sorted(title_mp3_urls, key=lambda tup: tup[2]))
    titles = list(titles)
    title = date_s.strftime('%B %d, %Y')
    # the 'programEpisode' parent element carries the "Not My Job" guest name
    title_elem_nmj = max(
        filter(lambda el: len(el.find_all('title')) == 1 and
               'type' in el.attrs and el.attrs['type'] == 'programEpisode',
               html.find_all('parent')))
    title_text = [
        line for line in title_elem_nmj.text.split('\n')
        if len(line.strip()) != 0][0]
    guest = re.sub('.*Guest', '', title_text).strip()
    title_guest_elems = [
        (idx, titl) for (idx, titl) in enumerate(titles) if titl == 'Not My Job']
    if len(title_guest_elems) != 0:
        idx_title_guest = max(title_guest_elems)[0]
        titles[idx_title_guest] = 'Not My Job: %s' % guest
    # episode title: "<Month DD, YYYY>: 1) seg; 2) seg; ..."
    title = '%s: %s.' % (title, '; '.join(
        '%d) %s' % (num + 1, titl) for (num, titl) in enumerate(titles)))
    outfiles = [
        os.path.join(outputdir, 'waitwait.%s.%d.mp3' % (decdate, num + 1))
        for num in range(len(mp3urls))]
    if not justFix:
        # download the segments in parallel; close/join the pool so worker
        # processes are not leaked
        time0 = time.time()
        pool = multiprocessing.Pool(processes=len(mp3urls))
        try:
            pool.map(_download_file,
                     zip(mp3urls, outfiles, len(mp3urls) * [verify]))
        finally:
            pool.close()
            pool.join()
        logging.debug('downloaded %d mp3 files in %0.3f seconds.',
                      len(mp3urls), time.time() - time0)
    return title, outfiles