Esempio n. 1
0
def get_title_wavfile_standard(date_s, outputdir, avconv_exec, debugonly=False, npr_api_key=None):
    """Find the Wait Wait segments aired on ``date_s`` and download them as MP3s.

    The NPR API is queried for the episode's stories; every story carrying an
    ``mp3`` element has its M3U playlist resolved to an MP3 URL, and the
    segments are then downloaded in parallel into ``outputdir``.

    Args:
        date_s: date of the episode (must support ``strftime``).
        outputdir: directory receiving the downloaded (or debug) files.
        avconv_exec: not used by this function; kept for interface
            compatibility with callers.
        debugonly: when true, only dump the API response to
            ``NPR.WaitWait.tree.<decdate>.xml`` in ``outputdir`` and return
            ``None``.
        npr_api_key: NPR API key; fetched with ``npr_utils.get_api_key()``
            when ``None``.

    Returns:
        ``None`` in debug mode, otherwise a ``(title, outfiles)`` tuple where
        ``title`` is the composed episode title and ``outfiles`` is the list
        of downloaded MP3 paths in segment order.

    Raises:
        ValueError: if no segment with a resolvable MP3 URL was found.
    """
    import logging  # local import: the surrounding file may not import it

    if npr_api_key is None:
        npr_api_key = npr_utils.get_api_key()

    # download this data into an lxml elementtree
    nprURL = npr_utils.get_NPR_URL(date_s, _npr_waitwait_progid, npr_api_key)
    decdate = npr_utils.get_decdate(date_s)
    tree = lxml.etree.fromstring(requests.get(nprURL).content)
    if debugonly:
        openfile = os.path.join(outputdir, "NPR.WaitWait.tree.%s.xml" % decdate)
        # tostring() returns an encoded byte string, so write in binary mode.
        with open(openfile, "wb") as outfile:
            outfile.write(lxml.etree.tostring(tree))
        return None

    # collect (title, mp3 URL, segment order) for every story that has audio
    title_mp3_urls = []
    for elem in tree.iter("story"):
        if len(list(elem.iter("mp3"))) == 0:
            continue
        title = list(elem.iter("title"))[0].text.strip()
        m3uurl = max(
            [elm for elm in elem.iter("mp3") if "type" in elm.keys() and elm.get("type") == "m3u"]
        ).text.strip()
        try:
            mp3url = requests.get(m3uurl).content.strip()
            # the segment number is the last "_"-separated token of the URL
            order = int(mp3url.split("_")[-1].replace(".mp3", ""))
        except Exception:
            # Best effort: a segment whose playlist cannot be resolved is
            # skipped, but leave a trace instead of failing silently.
            logging.debug("skipping segment %r: could not resolve %s", title, m3uurl)
            continue
        title_mp3_urls.append((title, mp3url, order))

    if not title_mp3_urls:
        # Without this guard the unpacking below (and Pool(processes=0))
        # fails with an opaque ValueError.
        raise ValueError(
            "Error, could not find any Wait Wait segments for %s." % date_s.strftime("%B %d, %Y")
        )

    titles, mp3urls, _ = zip(*sorted(title_mp3_urls, key=lambda tup: tup[2]))
    title = date_s.strftime("%B %d, %Y")
    title = "%s: %s." % (title, "; ".join(["%d) %s" % (num + 1, titl) for (num, titl) in enumerate(titles)]))
    outfiles = [
        os.path.join(outputdir, "waitwait.%s.%d.mp3" % (decdate, num + 1)) for num in range(len(mp3urls))
    ]

    # download those files, one worker process per segment
    pool = multiprocessing.Pool(processes=len(mp3urls))
    try:
        pool.map(_download_file, zip(mp3urls, outfiles))
    finally:
        # always release the worker processes, even if a download fails
        pool.close()
        pool.join()

    return title, outfiles
Esempio n. 2
0
def rm_download_file( date_s, outdir = os.getcwd() ):
    """Download the RealMedia file of the Wait Wait episode aired on ``date_s``.

    Args:
        date_s: date of the episode (must support ``strftime``).
        outdir: directory receiving the file. Note that the default is the
            working directory at the time this function was *defined*, not
            at call time.

    Returns:
        Path of the downloaded ``NPR.WaitWait.<decdate>.rm`` file.

    Raises:
        ValueError: if the file could not be downloaded or written; any
            partially written file is removed first.
    """
    decdate = npr_utils.get_decdate( date_s )
    outfile = os.path.join( outdir, 'NPR.WaitWait.%s.rm' % decdate )
    try:
        dsub = date_s.strftime('%Y%m%d')
        rm_url = 'http://download.npr.org/real.npr.na-central/waitwait/%s_waitwait.rm' % dsub
        req = urllib2.urlopen( rm_url )
        try:
            # RealMedia is binary data: text mode would corrupt it on
            # platforms that translate line endings.
            with open( outfile, 'wb' ) as openfile:
                openfile.write( req.read() )
        finally:
            # release the HTTP connection whether or not the write succeeded
            req.close()
        return outfile
    except Exception:
        # do not leave a truncated download behind
        if os.path.isfile( outfile ):
            os.remove( outfile )
        raise ValueError("Error, could not download Wait Wait RM file for '%s' into %s." % (
            npr_utils.get_datestring(date_s), outdir ) )
Esempio n. 3
0
def rm_download_file(date_s, outdir=os.getcwd()):
    """Download the RealMedia file of the Wait Wait episode aired on ``date_s``.

    Args:
        date_s: date of the episode (must support ``strftime``).
        outdir: directory receiving the file. Note that the default is the
            working directory at the time this function was *defined*, not
            at call time.

    Returns:
        Path of the downloaded ``NPR.WaitWait.<decdate>.rm`` file.

    Raises:
        ValueError: if the file could not be downloaded or written; any
            partially written file is removed first.
    """
    decdate = npr_utils.get_decdate(date_s)
    outfile = os.path.join(outdir, 'NPR.WaitWait.%s.rm' % decdate)
    try:
        dsub = date_s.strftime('%Y%m%d')
        rm_url = 'http://download.npr.org/real.npr.na-central/waitwait/%s_waitwait.rm' % dsub
        req = urllib2.urlopen(rm_url)
        try:
            # RealMedia is binary data: text mode would corrupt it on
            # platforms that translate line endings.
            with open(outfile, 'wb') as openfile:
                openfile.write(req.read())
        finally:
            # release the HTTP connection whether or not the write succeeded
            req.close()
        return outfile
    except Exception:
        # do not leave a truncated download behind
        if os.path.isfile(outfile):
            os.remove(outfile)
        raise ValueError(
            "Error, could not download Wait Wait RM file for '%s' into %s." %
            (npr_utils.get_datestring(date_s), outdir))
Esempio n. 4
0
def get_waitwait(outputdir, date_s, order_totnum=None, file_data=None, debugonly=False, exec_dict=None):
    """Build a tagged M4A file for the Wait Wait episode aired on ``date_s``.

    Episodes from 2006 onwards are assembled from the MP3 segments returned
    by ``get_title_wavfile_standard``; earlier episodes are converted from
    the RealMedia archive. The intermediate audio files are removed after
    the avconv conversion, and the result is tagged with mutagen.

    Args:
        outputdir: existing directory receiving the M4A file.
        date_s: date of the episode; must be a Saturday.
        order_totnum: ``(order_in_year, total_in_year)`` track numbers;
            computed with ``npr_utils`` when ``None``.
        file_data: cover image data, stored as PNG cover art; fetched with
            ``get_waitwait_image()`` when ``None``.
        debugonly: forwarded to ``get_title_wavfile_standard`` (only used
            for episodes from 2006 onwards).
        exec_dict: mapping holding the ``"avconv"`` executable path; looked
            up with ``npr_utils.find_necessary_executables()`` when ``None``.

    Returns:
        Path of the tagged M4A file, or ``None`` when
        ``get_title_wavfile_standard`` returned nothing (debug mode).

    Raises:
        ValueError: if ``outputdir`` is not a directory or ``date_s`` is not
            a Saturday.
    """

    # check if outputdir is a directory
    if not os.path.isdir(outputdir):
        raise ValueError("Error, %s is not a directory." % outputdir)

    # check if actually saturday
    if not npr_utils.is_saturday(date_s):
        raise ValueError("Error, date = %s not a Saturday." % npr_utils.get_datestring(date_s))

    if exec_dict is None:
        exec_dict = npr_utils.find_necessary_executables()
    assert exec_dict is not None
    avconv_exec = exec_dict["avconv"]

    if order_totnum is None:
        order_totnum = npr_utils.get_order_number_saturday_in_year(date_s)
    order_in_year, tot_in_year = order_totnum

    if file_data is None:
        file_data = get_waitwait_image()

    year = date_s.year
    decdate = npr_utils.get_decdate(date_s)
    m4afile = os.path.join(outputdir, "NPR.WaitWait.%s.m4a" % decdate)

    # Work out what avconv should read and which intermediate files must be
    # deleted once the conversion has run.
    if year >= 2006:
        tup = get_title_wavfile_standard(date_s, outputdir, avconv_exec, debugonly=debugonly)
        if tup is None:
            return
        title, outfiles = tup
        # "\\ " is the same two characters the original "\ " produced,
        # spelled without an invalid escape sequence.
        fnames = [filename.replace(" ", "\\ ") for filename in outfiles]
        avconv_input = "concat:%s" % "|".join(fnames)
        tempfiles = list(outfiles)
    else:
        title = waitwait_realmedia.rm_get_title_from_url(date_s)
        rmfile = waitwait_realmedia.rm_download_file(date_s, outdir=outputdir)
        wavfile = waitwait_realmedia.rm_create_wav_file(date_s, rmfile, outdir=outputdir)
        os.remove(rmfile)
        avconv_input = wavfile
        tempfiles = [wavfile]

    # now convert to m4a file
    split_cmd = [
        avconv_exec,
        "-y",
        "-i",
        avconv_input,
        "-ar",
        "44100",
        "-ac",
        "2",
        "-threads",
        "%d" % multiprocessing.cpu_count(),
        "-strict",
        "experimental",
        "-acodec",
        "aac",
        m4afile,
    ]
    try:
        proc = subprocess.Popen(split_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # communicate() drains both pipes so the child cannot block on them.
        # NOTE(review): the exit status of avconv is not checked here.
        proc.communicate()
    finally:
        # remove the intermediate audio even if launching avconv failed
        for filename in tempfiles:
            os.remove(filename)

    # now put in metadata
    mp4tags = mutagen.mp4.MP4(m4afile)
    mp4tags.tags["\xa9nam"] = [title]
    mp4tags.tags["\xa9alb"] = ["Wait Wait...Don't Tell Me: %d" % year]
    mp4tags.tags["\xa9ART"] = ["Peter Sagal"]
    mp4tags.tags["\xa9day"] = ["%d" % year]
    mp4tags.tags["\xa9cmt"] = ["more info at : NPR Web site"]
    mp4tags.tags["trkn"] = [(order_in_year, tot_in_year)]
    mp4tags.tags["covr"] = [mutagen.mp4.MP4Cover(file_data, mutagen.mp4.MP4Cover.FORMAT_PNG)]
    mp4tags.tags["\xa9gen"] = ["Podcast"]
    mp4tags.save()
    return m4afile
Esempio n. 5
0
def get_waitwait(outputdir,
                 date_s,
                 order_totnum=None,
                 file_data=None,
                 debugonly=False,
                 exec_dict=None,
                 verify=True,
                 justFix=False):
    """Build (or re-title) the M4A file of the Wait Wait episode on ``date_s``.

    Episodes from 2006 onwards are assembled from the MP3 segments returned
    by ``get_title_wavfile_standard``; earlier episodes are converted from
    the RealMedia archive. The intermediate audio files are removed after
    the avconv conversion, and the result is tagged with mutagen.

    Args:
        outputdir: existing directory receiving the M4A file.
        date_s: date of the episode; must be a Saturday.
        order_totnum: ``(order_in_year, total_in_year)`` track numbers;
            computed with ``npr_utils`` when ``None``.
        file_data: cover image data, stored as PNG cover art; fetched with
            ``get_waitwait_image`` when ``None``.
        debugonly: forwarded to ``get_title_wavfile_standard`` (only used
            for episodes from 2006 onwards).
        exec_dict: mapping holding the ``'avconv'`` executable path; looked
            up with ``npr_utils.find_necessary_executables()`` when ``None``.
        verify: forwarded to ``get_waitwait_image`` and
            ``get_title_wavfile_standard``.
        justFix: when true (honoured only for year >= 2006), only rewrite
            the title tag of an already existing M4A file.

    Returns:
        Path of the M4A file, or ``None`` when
        ``get_title_wavfile_standard`` returned nothing (debug mode) or when
        ``justFix`` is set and the M4A file does not exist.

    Raises:
        ValueError: if ``outputdir`` is not a directory, ``date_s`` is not a
            Saturday, or avconv lacks the concatenation protocol.
    """

    # check if outputdir is a directory
    if not os.path.isdir(outputdir):
        raise ValueError("Error, %s is not a directory." % outputdir)

    # check if actually saturday
    if not npr_utils.is_saturday(date_s):
        raise ValueError("Error, date = %s not a Saturday." %
                         npr_utils.get_datestring(date_s))

    if exec_dict is None:
        exec_dict = npr_utils.find_necessary_executables()
    assert (exec_dict is not None)
    avconv_exec = exec_dict['avconv']

    if order_totnum is None:
        order_totnum = npr_utils.get_order_number_saturday_in_year(date_s)
    order_in_year, tot_in_year = order_totnum

    if file_data is None:
        file_data = get_waitwait_image(verify=verify)

    year = date_s.year
    decdate = npr_utils.get_decdate(date_s)
    m4afile = os.path.join(outputdir, 'NPR.WaitWait.%s.m4a' % decdate)

    # Work out what avconv should read and which intermediate files must be
    # deleted once the conversion has run.
    if year >= 2006:
        tup = get_title_wavfile_standard(date_s,
                                         outputdir,
                                         avconv_exec,
                                         debugonly=debugonly,
                                         verify=verify,
                                         justFix=justFix)
        if tup is None:
            return
        title, outfiles = tup
        if justFix:  # works only for year >= 2006
            if not os.path.isfile(m4afile):
                # single-argument call form: valid as both the Python 2
                # statement and the Python 3 function
                print("Error, %s does not exist." % os.path.basename(m4afile))
                return
            mp4tags = mutagen.mp4.MP4(m4afile)
            mp4tags.tags['\xa9nam'] = [
                title,
            ]
            mp4tags.save()
            logging.debug('fixed title for %s.' % m4afile)
            return m4afile
        # '\\ ' is the same two characters the original '\ ' produced,
        # spelled without an invalid escape sequence.
        fnames = [filename.replace(' ', '\\ ') for filename in outfiles]
        avconv_input = 'concat:%s' % '|'.join(fnames)
        tempfiles = list(outfiles)
        uses_concat = True
    else:
        title = waitwait_realmedia.rm_get_title_from_url(date_s)
        rmfile = waitwait_realmedia.rm_download_file(date_s, outdir=outputdir)
        wavfile = waitwait_realmedia.rm_create_wav_file(date_s,
                                                        rmfile,
                                                        outdir=outputdir)
        os.remove(rmfile)
        avconv_input = wavfile
        tempfiles = [wavfile]
        uses_concat = False

    # now convert to m4a file
    split_cmd = [
        avconv_exec, '-y', '-i', avconv_input, '-ar', '44100', '-ac', '2',
        '-threads',
        '%d' % multiprocessing.cpu_count(), '-strict', 'experimental',
        '-acodec', 'aac', m4afile
    ]
    try:
        proc = subprocess.Popen(split_cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        # communicate() drains both pipes so the child cannot block on them
        _, stderr_val = proc.communicate()
    finally:
        # remove the intermediate audio on success and on every failure path
        for filename in tempfiles:
            os.remove(filename)

    # The byte-string literal matches the pipe output on Python 2 and 3.
    if uses_concat and b'Protocol not found' in stderr_val:
        raise ValueError(
            "Error, AVCONV does not have the concatenation protocol.")

    # now put in metadata
    mp4tags = mutagen.mp4.MP4(m4afile)
    mp4tags.tags['\xa9nam'] = [
        title,
    ]
    mp4tags.tags['\xa9alb'] = [
        "Wait Wait...Don't Tell Me: %d" % year,
    ]
    mp4tags.tags['\xa9ART'] = [
        'Peter Sagal',
    ]
    mp4tags.tags['\xa9day'] = [
        '%d' % year,
    ]
    mp4tags.tags['\xa9cmt'] = [
        "more info at : NPR Web site",
    ]
    mp4tags.tags['trkn'] = [
        (order_in_year, tot_in_year),
    ]
    mp4tags.tags['covr'] = [
        mutagen.mp4.MP4Cover(file_data, mutagen.mp4.MP4Cover.FORMAT_PNG),
    ]
    mp4tags.tags['\xa9gen'] = [
        'Podcast',
    ]
    mp4tags.save()
    return m4afile
Esempio n. 6
0
def get_title_wavfile_standard(date_s,
                               outputdir,
                               avconv_exec,
                               debugonly=False,
                               npr_api_key=None,
                               verify=True,
                               justFix=False):
    """Find the Wait Wait segments aired on ``date_s`` and download them as MP3s.

    The NPR API is queried for the episode's stories. For every story with
    exactly one ``mp3`` element, the title is scraped from the story's web
    page and the M3U playlist is resolved to an MP3 URL. A segment titled
    ``Not My Job`` gets the guest name (taken from the ``programEpisode``
    parent element) appended. Unless ``justFix`` is set, the segments are
    then downloaded in parallel into ``outputdir``.

    Args:
        date_s: date of the episode (must support ``strftime``).
        outputdir: directory receiving the downloaded (or debug) files.
        avconv_exec: not used by this function; kept for interface
            compatibility with callers.
        debugonly: when true, only dump the prettified API response to
            ``NPR.WaitWait.<decdate>.html`` in ``outputdir`` and return
            ``None``.
        npr_api_key: NPR API key; fetched with ``npr_utils.get_api_key()``
            when ``None``.
        verify: passed as ``verify`` to every ``requests.get`` call made
            here and to ``_download_file``.
        justFix: when true, compute title and file names but skip the
            downloads.

    Returns:
        ``None`` in debug mode, otherwise a ``(title, outfiles)`` tuple where
        ``title`` is the composed episode title and ``outfiles`` is the list
        of MP3 paths in segment order.

    Raises:
        ValueError: if the API query does not return status 200, or if no
            segment with a resolvable MP3 URL was found.
    """
    if npr_api_key is None:
        npr_api_key = npr_utils.get_api_key()

    # download this data into an lxml elementtree
    decdate = npr_utils.get_decdate(date_s)
    response = requests.get('https://api.npr.org/query',
                            verify=verify,
                            params={
                                'date': date_s.strftime('%Y-%m-%d'),
                                'output': 'NPRML',
                                'apiKey': npr_api_key,
                                'dataType': 'story',
                                'id': _npr_waitwait_progid
                            })
    if response.status_code != 200:
        raise ValueError(
            "Error, could not get wait wait episode on %s. Error code is %d." %
            (date_s.strftime('%B %d, %Y'), response.status_code))
    html = BeautifulSoup(response.content, 'lxml')
    if debugonly:
        openfile = os.path.join(outputdir, 'NPR.WaitWait.%s.html' % decdate)
        with open(openfile, 'w') as outfile:
            outfile.write('%s\n' % html.prettify())
        return None

    def _get_title(title_URL):
        # Scrape the segment title (text before the first ':') from the
        # story page; None when the page cannot be fetched.
        r2 = requests.get(title_URL, verify=verify)
        if r2.status_code != 200:
            return None
        h2 = BeautifulSoup(r2.content, 'lxml')
        title = titlecase.titlecase(
            max(h2.find_all('title')).text.split(':')[0].strip())
        return title

    # now get tuple of title to mp3 file
    title_mp3_urls = []
    for elem in html.find_all('story'):
        if len(elem.find_all('mp3')) != 1:
            continue
        # the first http: line of the story text is its web page URL
        all_texts = [
            line for line in elem.text.split('\n')
            if line.strip().startswith('http:')
        ]
        if not all_texts:
            # no story link to scrape a title from: skip this story
            continue
        title_URL = all_texts[0].strip()
        title = _get_title(title_URL)
        if title is None:
            continue
        m3u_elems = [
            elm for elm in elem.find_all('mp3')
            if 'type' in elm.attrs and elm['type'] == 'm3u'
        ]
        if not m3u_elems:
            # story has audio but no m3u playlist to resolve: skip it
            continue
        m3uurl = max(m3u_elems).get_text().strip()
        try:
            mp3url = requests.get(m3uurl, verify=verify).content.strip()
            # the segment number is the last '_'-separated token of the URL
            order = int(mp3url.split('_')[-1].replace('.mp3', ''))
        except Exception:
            # Best effort: a segment whose playlist cannot be resolved is
            # skipped, but leave a trace instead of failing silently.
            logging.debug('skipping segment %r: could not resolve %s', title,
                          m3uurl)
            continue
        title_mp3_urls.append((title, mp3url, order))

    if not title_mp3_urls:
        # Without this guard the code below fails with an opaque unpacking
        # error (or Pool(processes=0)).
        raise ValueError(
            "Error, could not find any Wait Wait segments for %s." %
            date_s.strftime('%B %d, %Y'))

    title_mp3_urls.sort(key=lambda tup: tup[2])
    titles = [tup[0] for tup in title_mp3_urls]
    mp3urls = [tup[1] for tup in title_mp3_urls]

    title = date_s.strftime('%B %d, %Y')
    title_elem_nmj = max([
        elem for elem in html.find_all('parent')
        if len(elem.find_all('title')) == 1 and 'type' in elem.attrs
        and elem.attrs['type'] == 'programEpisode'
    ])
    # first non-blank line of the episode element; the guest name is
    # whatever follows "Guest" on that line
    title_text = [
        line for line in title_elem_nmj.text.split('\n')
        if len(line.strip()) != 0
    ][0]
    guest = re.sub('.*Guest', '', title_text).strip()
    title_guest_elems = [(idx, titl) for (idx, titl) in enumerate(titles)
                         if titl == 'Not My Job']
    if len(title_guest_elems) != 0:
        idx_title_guest = max(title_guest_elems)[0]
        titles[idx_title_guest] = 'Not My Job: %s' % guest
    title = '%s: %s.' % (title, '; '.join(
        ['%d) %s' % (num + 1, titl) for (num, titl) in enumerate(titles)]))
    outfiles = [
        os.path.join(outputdir, 'waitwait.%s.%d.mp3' % (decdate, num + 1))
        for num in range(len(mp3urls))
    ]
    if not justFix:
        # download those files, one worker process per segment
        time0 = time.time()
        pool = multiprocessing.Pool(processes=len(mp3urls))
        try:
            pool.map(_download_file,
                     zip(mp3urls, outfiles, len(mp3urls) * [verify]))
        finally:
            # always release the worker processes, even if a download fails
            pool.close()
            pool.join()
        logging.debug('downloaded %d mp3 files in %0.3f seconds.' %
                      (len(mp3urls), time.time() - time0))

    return title, outfiles