Example #1
0
def _waitwait( ):
    """
    Command-line driver: parse arguments and download the requested
    `NPR Wait Wait <waitwait_>`_ episode via :py:meth:`get_waitwait`.
    """
    #
    ## FIX: compute the default date string ONCE. The original called
    ## get_datestring( _get_last_saturday( datetime.datetime.now( ) ) ) twice
    ## (for the default and for the help text); two now() calls straddling
    ## midnight could disagree.
    default_date = get_datestring( _get_last_saturday( datetime.datetime.now( ) ) )
    parser = ArgumentParser( )
    parser.add_argument('--dirname', dest='dirname', type=str,
                        action = 'store', default = _default_inputdir,
                        help = 'Name of the directory to store the file. Default is %s.' %
                        _default_inputdir )
    parser.add_argument('--date', dest='date', type=str,
                        action = 'store', default = default_date,
                        help = 'The date, in the form of "January 1, 2014." The default is last Saturday, %s.' %
                        default_date )
    parser.add_argument('--dump', dest='do_dump', action='store_true', default = False,
                        help = 'If chosen, download the NPR XML data sheet for this Wait Wait episode.')
    parser.add_argument('--level', dest='level', action='store', type=str, default = 'NONE',
                        choices = sorted( logging_dict ),
                        help = 'choose the debug level for downloading NPR Wait Wait episodes or their XML representation of episode info. Can be one of %s. Default is NONE.' % sorted( logging_dict ) )
    parser.add_argument('--justfix', dest='do_justfix', action='store_true', default = False,
                        help = "If chosen, just fix the title of an existing NPR Wait Wait episode's file.")
    args = parser.parse_args( )
    logger.setLevel( logging_dict[ args.level ] )
    fname = get_waitwait(
        args.dirname, get_time_from_datestring( args.date ),
        dump = args.do_dump, justFix = args.do_justfix )
Example #2
0
def _process_freshairs_by_year_tuple(input_tuple):
    """
    Worker entry point: download a batch of NPR Fresh Air episodes.

    ``input_tuple`` is ``(outputdir, totnum, verbose, datetimes_order_tuples)``
    where ``datetimes_order_tuples`` is an iterable of ``(date, order)``
    pairs. One shared Chrome webdriver is created for the whole batch.
    Failures for individual dates are logged and do not stop the loop.
    """
    outputdir, totnum, verbose, datetimes_order_tuples = input_tuple
    driver = npr_utils.get_chrome_driver()
    for date_s, order in datetimes_order_tuples:
        t_start = time.time()
        try:
            filename = get_freshair(
                outputdir, date_s,
                order_totnum=(order, totnum), driver=driver)
            elapsed = time.time() - t_start
            logging.info('processed %s in %0.3f seconds.' %
                         (os.path.basename(filename), elapsed))
        except Exception as exc:
            # log the underlying error, then a human-readable summary
            logging.error(str(exc))
            logging.error(
                'Could not create Fresh Air episode for date %s for some reason'
                % npr_utils.get_datestring(date_s))
def rm_get_main_url(date_s):
    """
    :param date_s: the :py:class:`date <datetime.date>` for this episode, which must be a Saturday.
    :returns: the full RealMedia_ URL for this older `NPR Wait Wait <waitwait_>`_ episode.
    :rtype: str

    .. _RealMedia: https://en.wikipedia.org/wiki/RealMedia
    """
    # Wait Wait only airs on Saturdays; reject anything else up front.
    if not npr_utils.is_saturday(date_s):
        raise ValueError("Error, this date given by '%s' is not a Saturday." %
                         npr_utils.get_datestring(date_s))
    #
    ## URL pieces: four-digit year, lowercase three-letter month, YYMMDD stamp
    return 'http://www.npr.org/programs/waitwait/archrndwn/%04d/%s/%s.waitwait.html' % (
        date_s.year,
        date_s.strftime('%b').lower(),
        date_s.strftime('%y%m%d'))
Example #4
0
def _process_waitwaits_by_year_tuple(input_tuple):
    """
    Worker entry point: download a batch of NPR Wait Wait episodes.

    ``input_tuple`` is ``(outputdir, totnum, datetimes_order_tuples)`` where
    ``datetimes_order_tuples`` is an iterable of ``(date, order)`` pairs.
    One shared Chrome webdriver is created for the whole batch; failures for
    individual dates are logged and do not stop the loop.
    """
    outputdir, totnum, datetimes_order_tuples = input_tuple
    # NOTE(review): ww_image is never used below; the call is kept in case
    # get_waitwait_image() has a side effect (e.g. caching) — confirm.
    ww_image = get_waitwait_image()
    driver = npr_utils.get_chrome_driver()
    for date_s, order in datetimes_order_tuples:
        time0 = time.time()
        try:
            fname = get_waitwait(outputdir,
                                 date_s,
                                 order_totnum=(order, totnum),
                                 driver=driver)
            # FIX: the original guarded this with ``if verbose:`` but
            # ``verbose`` was never defined (the tuple unpacks only three
            # items), so every SUCCESSFUL download raised NameError inside
            # the try-block and printed the failure message. Log
            # unconditionally, matching _process_freshairs_by_year_tuple.
            logging.info('Processed %s in %0.3f seconds.' %
                         (fname, time.time() - time0))
        except Exception as e:
            logging.error(str(e))
            logging.error(
                'Could not create Wait Wait episode for date %s for some reason.'
                % (npr_utils.get_datestring(date_s)))
def rm_download_file(date_s, outdir=None):
    """
    downloads the RealMedia_ `NPR Wait Wait <waitwait_>`_ episode into a specified directory.
    
    :param date_s: the :py:class:`date <datetime.date>` for this episode, which must be a Saturday.
    :param str outdir: the directory into which one downloads the `NPR Fresh Air`_ episodes.
       If ``None``, the current working directory at call time is used.
    :returns: the RealMedia_ output file.
    :rtype: str
    :raises ValueError: if the download fails for any reason.
    """
    # FIX: the original default ``outdir=os.getcwd()`` froze the working
    # directory at import time; resolve it at call time instead.
    if outdir is None: outdir = os.getcwd()
    decdate = npr_utils.get_decdate(date_s)
    outfile = os.path.join(outdir, 'NPR.WaitWait.%s.rm' % decdate)
    try:
        dsub = date_s.strftime('%Y%m%d')
        rm_url = 'http://download.npr.org/real.npr.na-central/waitwait/%s_waitwait.rm' % dsub
        # close the HTTP response deterministically
        with closing(urlopen(rm_url)) as req:
            data = req.read()
        # FIX: the payload is binary; the original opened in text mode 'w',
        # which raises TypeError on bytes under Python 3.
        with open(outfile, 'wb') as openfile:
            openfile.write(data)
        return outfile
    except Exception as e:
        # clean up any partial download before surfacing the error
        if os.path.isfile(outfile):
            os.remove(outfile)
        # chain the cause so the original failure is not lost
        raise ValueError(
            "Error, could not download Wait Wait RM file for '%s' into %s." %
            (npr_utils.get_datestring(date_s), outdir)) from e
Example #6
0
def _freshair():
    """
    Command-line driver: parse arguments and download the requested
    `NPR Fresh Air`_ episode via :py:meth:`freshair.get_freshair`.
    """
    #
    ## FIX: compute today's date string ONCE. The original called
    ## npr_utils.get_datestring(datetime.datetime.now()) twice (default and
    ## help text); two now() calls straddling midnight could disagree.
    today_datestring = npr_utils.get_datestring(datetime.datetime.now())
    parser = ArgumentParser()
    parser.add_argument(
        '--dirname',
        dest='dirname',
        type=str,
        action='store',
        default=_default_inputdir,
        help='Name of the directory to store the file. Default is %s.' %
        _default_inputdir)
    parser.add_argument(
        '-d',
        '--date',
        dest='date',
        type=str,
        action='store',
        default=today_datestring,
        help=
        'The date, in the form of "January 1, 2014." The default is today\'s date, %s.'
        % today_datestring)
    parser.add_argument(
        '--mp3exist',
        dest='mp3_exist',
        action='store_true',
        default=False,
        help=' '.join([
            'If chosen, then do not download the transitional mp3 files.',
            'Use the ones that already exist.'
        ]))
    parser.add_argument(
        '-D',
        '--debug',
        dest='debug',
        action='store_true',
        help='If chosen, dump out NPR Freshair webpage as XML.',
        default=False)
    parser.add_argument(
        '-L',
        '--level',
        dest='level',
        action='store',
        type=str,
        default='NONE',
        choices=sorted(logging_dict),
        help=
        'choose the debug level for downloading NPR Fresh Air episodes or their XML representation of episode info. Can be one of %s. Default is NONE.'
        % sorted(logging_dict))
    parser.add_argument(
        '-r',
        '--relax',
        dest='relax_date_check',
        action='store_true',
        default=False,
        help=
        'If chosen, then do NOT do a date check validation of NPR URL articles.'
    )
    args = parser.parse_args()
    # expand ~ so the directory check in get_freshair sees a real path
    dirname = os.path.expanduser(args.dirname)
    logger.setLevel(logging_dict[args.level])
    fname = freshair.get_freshair(dirname,
                                  npr_utils.get_time_from_datestring(
                                      args.date),
                                  debug=args.debug,
                                  mp3_exist=args.mp3_exist,
                                  relax_date_check=args.relax_date_check)
Example #7
0
def get_title_mp3_urls_attic(outputdir,
                             date_s,
                             debug=False,
                             to_file_debug=True):
    """
    older functionality that uses the `old NPR API` to get an ordered :py:class:`list` of :py:class:`tuple` of stories for an `NPR Fresh Air`_ episode. Here is an example operation,

    .. code-block:: python

       >> date_s = datetime.datetime.strptime('July 31, 2020', '%B %d, %Y' ).date( )
       >> title_mp3_urls = get_title_mp3_urls_attic( date_s )
       >> title_list_mp3_urls
       >> [('Remembering Regis Philbin, Prolific Talk and Game Show Personality',
         'https://ondemand.npr.org/anon.npr-mp3/npr/fa/2020/07/20200731_fa_01.mp3'),
        ("With 'Folklore,' Taylor Swift Marks Off Her Past and Enters a New Phase",
         'https://ondemand.npr.org/anon.npr-mp3/npr/fa/2020/07/20200731_fa_02.mp3'),
        ('Remembering Jazz Singer Annie Ross',
         'https://ondemand.npr.org/anon.npr-mp3/npr/fa/2020/07/20200731_fa_03.mp3'),
        ("'Muppets Now' Proves It's Not Easy to Capture the Old Muppet Magic",
         'https://ondemand.npr.org/anon.npr-mp3/npr/fa/2020/07/20200731_fa_04.mp3')]

    .. note::

       I was able to get this to work by replacing the ``https://`` in the API URL query with ``http://``.
       
    :param str outputdir: the directory into which one downloads the `NPR Fresh Air`_ episodes.
    :param date_s: the :py:class:`date <datetime.date>` for this episode, which must be a weekday.
    :param bool debug: optional argument, if ``True`` returns the :py:class:`BeautifulSoup <bs4.BeautifulSoup>` XML tree for the `NPR Fresh Air`_ episode, or its file representation. Default is ``False``.
    :param bool to_file_debug: optional argument, if ``True`` dumps out the file representation of the :py:class:`BeautifulSoup <bs4.BeautifulSoup>` XML tree for the `NPR Fresh Air`_ episode. Default is ``False``.
    :returns: the :py:class:`list` of stories, by order, for the `NPR Fresh Air`_ episode. The first element of each :py:class:`tuple` is the story title, and the second is the MP3_ URL for the story. *However*, if ``debug`` is ``True`` and ``to_file_debug`` is ``True``, returns the :py:class:`BeautifulSoup <bs4.BeautifulSoup>` XML tree for this `NPR Fresh Air`_ episode.
    
    .. seealso::
    
       * :py:meth:`get_freshair <nprstuff.core.freshair.get_freshair>`.
       * :py:class:`get_title_mp3_urls_working <nprstuff.core.freshair.get_title_mp3_urls_working>`.
    """
    #
    ## FIX: build the query parameters once. The original constructed this
    ## dict twice — inline in requests.get and again to hand-build a URL
    ## that was print()-ed on every call (leftover debugging output).
    params = {
        'id': _npr_FreshAir_progid,
        'date': date_s.strftime('%Y-%m-%d'),
        'dateType': 'story',
        'output': 'NPRML',
        'apiKey': npr_utils.get_api_key()
    }
    ## download this data into a BeautifulSoup object
    resp = requests.get('http://api.npr.org/query', params=params)
    ## FIX: log (do not print) the fully-resolved, properly urlencoded URL;
    ## resp.url is the exact URL requests issued.
    logging.debug('full URL = %s' % resp.url)

    if not resp.ok:
        logging.info('ERROR GETTING FRESH AIR STORY FOR %s' %
                     date_s.strftime('%d %B %Y'))
        return None
    html = BeautifulSoup(resp.content, 'lxml')
    #
    if debug:
        # dump the XML tree to a file for offline inspection, then return it
        if to_file_debug:
            decdate = date_s.strftime('%d.%m.%Y')
            with open(
                    os.path.join(outputdir,
                                 'NPR.FreshAir.tree.%s.xml' % decdate),
                    'w') as openfile:
                openfile.write('%s\n' % html.prettify())
        return html
    #
    ## check for unavailable tag
    if len(html.find_all('unavailable', {'value': 'true'})) != 0:
        unavailable_elem = html.find_all('unavailable', {'value': 'true'})[0]
        if unavailable_elem.text is None:
            print(
                'Could not create Fresh Air episode for date %s because unavailable without a specific reason'
                % npr_utils.get_datestring(date_s))
        else:
            print(
                'Could not create Fresh Air episode for date %s because unavailable for this reason: %s'
                % (npr_utils.get_datestring(date_s),
                   unavailable_elem.text.strip()))
        return None
    #
    ## now get tuple of title to mp3 file
    title_mp3_urls = _process_freshair_titlemp3_tuples(html)
    if title_mp3_urls is None or len(title_mp3_urls) == 0:
        print('Error, could not find any Fresh Air episodes for date %s.' %
              npr_utils.get_datestring(date_s))
        return None
    return title_mp3_urls
Example #8
0
def get_waitwait(outputdir,
                 date_s,
                 order_totnum=None,
                 dump=False,
                 driver=None,
                 justFix=False):
    """
    The main driver method that downloads `NPR Wait Wait <waitwait_>`_ episodes for a given date into a specified output directory.
    
    :param str outputdir: the directory into which one downloads the `NPR Wait Wait <waitwait_>`_ episodes.
    :param date_s: the :py:class:`date <datetime.date>` for this episode, which must be a weekday.
    :param tuple order_totnum: optional argument, the :py:class:`tuple` of track number and total number of tracks of `NPR Wait Wait <waitwait_>`_ episodes for that year. If ``None``, then this information is gathered from :py:meth:`get_order_num_saturday_in_year <nprstuff.core.npr_utils.get_order_num_saturday_in_year>`.
    :param bool dump: optional argument, if ``True`` returns the :py:class:`BeautifulSoup <bs4.BeautifulSoup>` XML tree for the `NPR Wait Wait <waitwait_>`_ episode (and downloads the XML tree into a file). Default is ``False``.
    :param driver: optional argument, the :py:class:`Webdriver <selenium.webdriver.remote.webdriver.WebDriver>` used for webscraping and querying (instead of using a functional API) for `NPR Wait Wait <waitwait_>`_ episodes. If ``None``, then a new :py:class:`Webdriver <selenium.webdriver.remote.webdriver.WebDriver>` will be defined and used within this method's scope.
    :param bool justFix: optional argument, if ``True`` and if `NPR Wait Wait <waitwait_>`_ file exists, then just change the title of the M4A_ file. Default is ``False``.

    :returns: the name of the `NPR Wait Wait <waitwait_>`_ episode file.
    :rtype: str
    """
    # check if outputdir is a directory
    if not os.path.isdir(outputdir):
        raise ValueError("Error, %s is not a directory." % outputdir)

    # check if actually saturday
    if not npr_utils.is_saturday(date_s):
        raise ValueError("Error, date = %s not a Saturday." %
                         npr_utils.get_datestring(date_s))

    #
    ## if driver is None
    if driver is None: driver = npr_utils.get_chrome_driver()

    # locate the avconv binary used below to concatenate / transcode audio
    # NOTE(review): assert is stripped under `python -O`; this would then
    # fail later with a TypeError instead — consider raising explicitly.
    exec_dict = npr_utils.find_necessary_executables()
    assert (exec_dict is not None)
    avconv_exec = exec_dict['avconv']
    logging.debug('avconv exec = %s.' % avconv_exec)

    # (track number, total tracks) for this year's Saturdays, used as the
    # 'trkn' metadata tag below
    if order_totnum is None:
        order_totnum = npr_utils.get_order_number_saturday_in_year(date_s)
    order_in_year, tot_in_year = order_totnum

    # PNG cover-art bytes for the 'covr' metadata tag
    file_data = get_waitwait_image()

    year = date_s.year
    decdate = npr_utils.get_decdate(date_s)
    # final destination of the episode file in outputdir
    m4afile = os.path.join(outputdir, 'NPR.WaitWait.%s.m4a' % decdate)
    logging.info(
        'INFO TO GET FIGURE OUT get_title_mp3s_url_working: %s, %s, %s, %s' %
        (m4afile, date_s, driver, dump))
    # episodes from 2006 onward are scraped via the webdriver; older ones
    # go through the legacy RealMedia path in the else-branch
    if year >= 2006:
        data = get_title_mp3_urls_working('.', date_s, driver, dump=dump)
        # in dump mode the raw tree/data is the return value
        if dump: return data
        title_mp3_urls = data
        if title_mp3_urls is None or len(title_mp3_urls) == 0: return None
        titles, songurls = list(zip(*title_mp3_urls))
        # episode title: "<date>: 1) <story>; 2) <story>; ..."
        title = date_s.strftime('%B %d, %Y')
        title = '%s: %s.' % (title, '; '.join(
            ['%d) %s' % (num + 1, titl) for (num, titl) in enumerate(titles)]))
        # justFix: only rewrite the title tag of an already-downloaded file
        if justFix:
            if not os.path.isfile(m4afile):
                print("Error, %s does not exist." % os.path.basename(m4afile))
                return
            mp4tags = mutagen.mp4.MP4(m4afile)
            mp4tags.tags['\xa9nam'] = [
                title,
            ]
            mp4tags.save()
            logging.info('fixed title for %s.' % m4afile)
            return m4afile

        logging.info('got here in NPR Wait Wait episode %s, title = %s.' %
                     (date_s, title))

        # temporary directory
        tmpdir = tempfile.mkdtemp()
        m4afile_temp = os.path.join(tmpdir, 'NPR.WaitWait.%s.m4a' % decdate)
        # one target mp3 path per story segment, numbered in episode order
        outfiles = [
            os.path.join(tmpdir, 'waitwait.%s.%d.mp3' % (decdate, num + 1))
            for (num, mp3url) in enumerate(songurls)
        ]

        # download those files in parallel; failed downloads come back as
        # None and are filtered out before concatenation
        with multiprocessing.Pool(processes=min(multiprocessing.cpu_count(),
                                                len(songurls))) as pool:
            outfiles = sorted(
                filter(None, pool.map(_download_file, zip(songurls,
                                                          outfiles))))

        # now convert to m4a file
        # NOTE(review): '\ ' is an invalid escape sequence (DeprecationWarning),
        # and backslash-escaping spaces is unnecessary for list-form subprocess
        # args — presumably a holdover from a shell-string version; confirm.
        fnames = list(
            map(lambda filename: filename.replace(' ', '\ '), outfiles))
        avconv_concat_cmd = 'concat:%s' % '|'.join(fnames)
        split_cmd = [
            avconv_exec, '-y', '-i', avconv_concat_cmd, '-ar', '44100', '-ac',
            '2', '-threads',
            '%d' % multiprocessing.cpu_count(), '-strict', 'experimental',
            '-acodec', 'aac', m4afile_temp
        ]
        logging.info("here is the split command: %s." % split_cmd)
        proc = subprocess.Popen(split_cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout_val, stderr_val = proc.communicate()
        logging.debug("stdout_val: %s." % stdout_val)
        logging.debug("stderr_val: %s." % stderr_val)
        #
        ## remove mp3 files
        for filename in outfiles:
            os.remove(filename)
    else:
        # legacy (<2006) path: fetch the RealMedia stream, convert to WAV,
        # then transcode to m4a
        tmpdir = tempfile.mkdtemp()
        title = waitwait_realmedia.rm_get_title_from_url(date_s)
        rmfile = waitwait_realmedia.rm_download_file(date_s, outdir=tmpdir)
        wavfile = waitwait_realmedia.rm_create_wav_file(date_s,
                                                        rmfile,
                                                        outdir=tmpdir)
        os.remove(rmfile)
        #
        ## now convert to m4a file
        # NOTE(review): m4afile_temp here lands in outputdir, i.e. the SAME
        # path as m4afile — the shutil.copy at the end then copies the file
        # onto itself (SameFileError under Python 3). Looks like it should
        # be tmpdir, as in the >=2006 branch; confirm this path is exercised.
        m4afile_temp = os.path.join(outputdir, 'NPR.WaitWait.%s.m4a' % decdate)
        split_cmd = [
            avconv_exec, '-y', '-i', wavfile, '-ar', '44100', '-ac', '2',
            '-threads',
            '%d' % multiprocessing.cpu_count(), '-strict', 'experimental',
            '-acodec', 'aac', m4afile_temp
        ]
        proc = subprocess.Popen(split_cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout_val, stderr_val = proc.communicate()
        #
        ## remove wav file
        os.remove(wavfile)
    #
    ## now put in metadata: title, album, artist, year, comment, track
    ## number, cover art, and genre, via mutagen's MP4 freeform tags
    mp4tags = mutagen.mp4.MP4(m4afile_temp)
    mp4tags.tags['\xa9nam'] = [
        title,
    ]
    mp4tags.tags['\xa9alb'] = [
        "Wait Wait...Don't Tell Me: %d" % year,
    ]
    mp4tags.tags['\xa9ART'] = [
        'Peter Sagal',
    ]
    mp4tags.tags['\xa9day'] = [
        '%d' % year,
    ]
    mp4tags.tags['\xa9cmt'] = [
        "more info at : NPR Web site",
    ]
    mp4tags.tags['trkn'] = [
        (order_in_year, tot_in_year),
    ]
    mp4tags.tags['covr'] = [
        mutagen.mp4.MP4Cover(file_data, mutagen.mp4.MP4Cover.FORMAT_PNG),
    ]
    mp4tags.tags['\xa9gen'] = [
        'Podcast',
    ]
    mp4tags.save()
    # make the file world-readable before publishing it
    os.chmod(m4afile_temp, 0o644)
    #
    ## now copy to actual location and remove temp directory
    shutil.copy(m4afile_temp, m4afile)
    shutil.rmtree(tmpdir)
    return m4afile