def parse_item(item_soup, stem_mapping):
    """Get information about a single HI episode.

    @param item_soup: Soup containing information about a single HI episode.
    @type item_soup: bs4.BeautifulSoup
    @param stem_mapping: Mapping used for tag stemming, passed through to
        get_item_tags — presumably maps tag stems to canonical tag names;
        TODO(review) confirm against get_item_tags.
    @type stem_mapping: dict
    @return: Dictionary describing the episode. Contains keys name
        (str value), date (datetime.date), loc (always an empty str for
        feed items), duration (seconds - int), and orig_tags (tags applied
        to episode - list of str)
    @rtype: dict
    """
    item_date = common.interpret_2822_date(
        item_soup.find('pubdate').contents[0]
    )

    duration_soup = item_soup.find('itunes:duration')
    if duration_soup:
        # Reuse the tag already located above instead of re-querying
        # the tree a second time.
        duration = common.interpret_duration(duration_soup.contents[0])
    else:
        # Feed entry has no explicit duration; default to 2 hours.
        duration = 7200

    title = item_soup.find('title').contents[0]

    return {
        'name': title,
        'date': item_date,
        'loc': '',
        'duration': duration,
        'orig_tags': get_item_tags(title, item_soup, stem_mapping)
    }
def parse_episode_page(loc, contents):
    """Parse a page describing a single podcast episode.

    @param loc: The URL of this page.
    @type loc: basestring
    @param contents: The raw HTML contents of the episode page from which
        episode information should be parsed.
    @type contents: basestring
    @return: Dictionary describing the episode. Contains keys title
        (str value), date (datetime.date), tags (sorted list of unique
        str), loc (url - str value), and duration (seconds - int)
    @rtype: dict
    """
    # Name the parser explicitly so bs4 does not guess one (avoids
    # GuessedAtParserWarning and parser-dependent results).
    soup = bs4.BeautifulSoup(contents, 'html.parser')

    header = soup.find(class_='centerPosts')
    title = header.find('strong').contents[0]

    # Date text looks like "Jan 5, 2015". Replacing the comma can leave a
    # double space, so split on any whitespace run rather than a single
    # space to avoid an empty token at index 2.
    date_str = soup.find(class_='pdateS').find('em').contents[0]
    date_components = date_str.replace(',', ' ').split()
    year = int(date_components[2])
    month = common.MONTH_ABBRV[date_components[0]]
    day = int(date_components[1])
    episode_date = datetime.date(year, month, day)

    # De-duplicate and sort the tag link texts.
    tags = sorted(set(map(
        lambda x: x.contents[0],
        soup.findAll('a', rel='tag')
    )))

    # Duration appears as "[ HH:MM:SS ]"; strip the brackets before parsing.
    duration_str = soup.find(class_='podpress_mediafile_dursize').contents[0]
    duration_str_clean = duration_str.replace('[ ', '').replace(' ]', '')
    duration = common.interpret_duration(duration_str_clean)

    return {
        'title': title,
        'date': episode_date,
        'tags': tags,
        'loc': loc,
        'duration': duration
    }
def process_item(item_soup):
    """Parse information about a single podcast episode.

    @param item_soup: Soup containing information about a single podcast
        episode.
    @type item_soup: bs4.BeautifulSoup
    @return: Dictionary describing the episode. Contains keys name
        (str value), date (datetime.date), tags (sorted list of unique
        str), loc (url - str value), and duration (seconds - int)
    @rtype: dict
    """
    title = item_soup.find('title').contents[0].strip()
    loc = item_soup.find('guid').contents[0]

    pub_date_raw = item_soup.find('pubdate').contents[0]
    pub_date = common.interpret_2822_date(pub_date_raw)

    # Materialize the tags as a list. Under Python 3 map() returns a
    # one-shot iterator, so the 'shorts' membership test below would
    # consume elements and they would then be missing from the
    # sorted(set(...)) returned to the caller.
    tags = [category.contents[0] for category in item_soup.findAll('category')]

    duration_soup = item_soup.find('itunes:duration')
    if duration_soup is None:
        # No explicit duration: assume 30 min for shorts, 60 min otherwise.
        duration = 1800 if 'shorts' in tags else 3600
    else:
        duration_str = duration_soup.contents[0]
        duration = common.interpret_duration(duration_str)

    return {
        'name': title,
        'date': pub_date,
        'tags': sorted(set(tags)),
        'loc': loc,
        'duration': duration
    }