def parse_element_to_json(element):
    """
    Convert a parsed HTML element into a small JSON-serialisable dict.

    The first matching rule wins:

    * the element has an ``img`` child  -> ``{"type": "image", "content": <src>}``
    * the element has a ``ul`` child    -> ``{"type": "links", "content": [<href>, ...]}``
      with one entry per ``a`` found inside that list
    * the element has non-blank text    -> ``{"type": "text", "content": <stripped text>}``

    Returns an empty dict when none of the rules apply.

    NOTE(review): the empty-dict fallthrough follows the original
    ``content = {}`` initialiser; callers should treat any falsy result as
    "nothing to emit" -- confirm against call sites.
    """
    if element.img:
        return {"type": "image", "content": element.img.get('src')}

    if element.ul:
        return {
            "type": "links",
            "content": [link.get('href') for link in element.ul.find_all('a')],
        }

    # Extract and strip the text once instead of once for the check and
    # again for the payload.
    text = element.get_text().strip()
    if text:
        return {"type": "text", "content": text}

    return {}
def parse_news_item(self, item_element: element.Tag, base_url: str) -> NewsItem:
    """
    Build a ``NewsItem`` from the markup of a single news entry.

    The entry is expected to contain an ``<h3>`` wrapping the title link, an
    element with class ``date``, optional ``.category`` elements, an optional
    element with class ``more``, and summary content in the siblings that
    follow the ``<h3>``.

    Note that ``item_element`` is modified in place: category elements (or
    their enclosing links) and the "more" element are extracted from the tree
    so that they do not end up in the summary text.

    :param item_element: Element containing one news entry.
    :param base_url: URL the title link's ``href`` is resolved against with
        ``urljoin``.
    :returns: A ``NewsItem`` whose ``id`` and ``url`` are both the resolved
        article URL.
    :raises FormatError: If the title heading, title link, link URL, title
        text, or date element is missing.
    """
    title_element = item_element.find('h3')
    if title_element is None:
        raise FormatError('No title heading found for article')

    title_link = title_element.find('a')
    if title_link is None:
        raise FormatError('No title link found for article')

    # Use ``.get`` rather than indexing so that an absent ``href`` attribute
    # is reported as a FormatError like an empty one, not as a KeyError.
    url = title_link.get('href')
    if not url:
        raise FormatError('No URL found for article')
    url = urljoin(base_url, url)

    title = title_link.get_text(strip=True)
    if not title:
        raise FormatError('No title content found')

    date_element = item_element.find(class_='date')
    if date_element is None:
        raise FormatError('No date found for article')
    date_string = date_element.get_text().strip().replace('Posted on: ', '')
    date = parse_datetime(date_string)

    # Remove the category links so we can more easily pull out summary text.
    # The category text itself is not part of the returned NewsItem, so it is
    # not collected here.
    for category_element in item_element.select('.category'):
        category_link = category_element.find_parent('a') or category_element
        category_link.extract()

    more_link = item_element.find(class_='more')
    if more_link:
        more_link.extract()

    # TODO: can/should we preserve HTML here?
    # Siblings after the heading are a mix of bare strings and tags; strings
    # are used as-is and tags are reduced to their stripped text.
    summaries = (
        node if isinstance(node, str) else node.get_text(strip=True)
        for node in title_element.next_siblings
    )
    summaries = (SUMMARY_PREFIX_PATTERN.sub('', text) for text in summaries)
    summary = ' '.join(summaries).strip()

    return NewsItem(id=url,
                    url=url,
                    title=title,
                    date_published=date,
                    summary=summary)
def is_news_heading(self, element: element.Tag) -> bool:
    """
    Tell whether ``element`` is a heading that introduces a group of news.

    True only when the element's tag name matches ``HEADING_PATTERN`` and its
    text matches ``MONTH_HEADING_PATTERN`` (both tested with ``.match``).
    """
    # Check the tag name first; the text is only extracted for elements that
    # pass this test.
    if HEADING_PATTERN.match(element.name) is None:
        return False

    heading_text = element.get_text()
    return MONTH_HEADING_PATTERN.match(heading_text) is not None
def is_news_heading(self, element: element.Tag) -> bool:
    """
    Tell whether ``element`` is a heading that introduces a group of news.

    True only when the element is an ``h3`` tag and its text matches
    ``MONTH_HEADING_PATTERN`` (tested with ``.match``).
    """
    # Anything that is not an <h3> is rejected without extracting its text.
    if element.name != 'h3':
        return False

    heading_text = element.get_text()
    return MONTH_HEADING_PATTERN.match(heading_text) is not None