def fetch_download(self, db: DatabaseHandler, download: dict) -> Optional[Response]:
    download = decode_object_from_bytes_if_needed(download)

    url = self._download_url(download=download)
    if not is_http_url(url):
        raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

    download['download_time'] = sql_now()
    download['state'] = 'fetching'

    try:
        db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)
    except McTupleAlreadyMovedError as ex:
        # Some attempts to set the download's row to "fetching" fail with:
        #
        #   "tuple to be locked was already moved to another partition due to concurrent update"
        #
        # If that happens, we assume that some other fetcher instance somehow got to the download first and do
        # nothing
        log.warning(f"Some other fetcher got to download {download['downloads_id']} first: {ex}")
        return None
    except Exception as ex:
        # Raise further on misc. errors
        raise ex

    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)

    return response
@classmethod
def _get_stories_from_syndicated_feed(cls, content: str, media_id: int, download_time: str) -> List[Dict[str, Any]]:
    """Parse the feed. Return a list of (non-database-backed) story dicts for each story found in the feed."""
    feed = parse_feed(content)
    if not feed:
        raise McCrawlerFetcherSoftError("Unable to parse feed.")

    stories = []

    for item in feed.items():

        url = item.link()
        if not url:
            log.warning("URL for feed item is empty, skipping")
            continue

        guid = item.guid_if_valid()
        if not guid:
            guid = url

        title = item.title()
        if not title:
            title = '(no title)'

        description = item.description()

        publish_date = item.publish_date_sql()
        if not publish_date:
            publish_date = download_time

        enclosures = []
        for enclosure in item.enclosures():
            enclosures.append({
                'url': enclosure.url(),
                'mime_type': enclosure.mime_type(),
                'length': enclosure.length(),
            })

        story = {
            'url': url,
            'guid': guid,
            'media_id': media_id,
            'publish_date': publish_date,
            'title': title,
            'description': description,
            'enclosures': enclosures,
        }
        stories.append(story)

    return stories
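# Illustrative only: the shape of one story dict returned by
# _get_stories_from_syndicated_feed() above. The field values here are made up;
# only the keys mirror the dict that the parser builds.
_EXAMPLE_SYNDICATED_STORY = {
    'url': 'https://www.example.com/2024/01/some-article.html',
    'guid': 'https://www.example.com/2024/01/some-article.html',  # falls back to the URL when the GUID is invalid
    'media_id': 1,
    'publish_date': '2024-01-01 00:00:00',  # falls back to download_time when the item has no date
    'title': '(no title)',
    'description': '',
    'enclosures': [
        {'url': 'https://www.example.com/audio/episode.mp3', 'mime_type': 'audio/mpeg', 'length': 123456},
    ],
}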
def add_stories_from_feed(self, db: DatabaseHandler, download: dict, content: str) -> List[int]:
    """
    Handle feeds of type 'web_page' by just creating a story to associate with the content.

    Web page feeds are feeds that consist of a web page that we download once a week and add as a story.
    """
    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    feeds_id = download['feeds_id']

    feed = db.find_by_id(table='feeds', object_id=feeds_id)

    title = html_title(html=content, fallback='(no title)')
    title += '[' + sql_now() + ']'

    guid = f"{str(int(time.time()))}:{download['url']}"[0:1024]

    new_story = {
        'url': download['url'],
        'guid': guid,
        'media_id': feed['media_id'],
        'publish_date': sql_now(),
        'title': title,
    }

    story = add_story(db=db, story=new_story, feeds_id=feeds_id)
    if not story:
        raise McCrawlerFetcherSoftError(f"Failed to add story {new_story}")

    db.query("""
        UPDATE downloads
        SET stories_id = %(stories_id)s,
            type = 'content'
        WHERE downloads_id = %(downloads_id)s
    """, {
        'stories_id': story['stories_id'],
        'downloads_id': download['downloads_id'],
    })

    # A web page that was just fetched is also a story
    story_ids = [
        story['stories_id'],
    ]

    return story_ids
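# Illustrative only (all values made up): for 'web_page' feeds the synthetic GUID built above
# joins the fetch-time Unix timestamp and the download URL (truncated to 1024 characters),
# and the fetch time is appended to the title in brackets:
_example_epoch = 1704067200                      # stand-in for int(time.time())
_example_url = 'https://www.example.com/page'    # stand-in for download['url']
_example_guid = f"{_example_epoch}:{_example_url}"[0:1024]
_example_title = 'Example Page Title' + '[' + '2024-01-01 00:00:00' + ']'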
def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
    download = decode_object_from_bytes_if_needed(download)

    url = self._download_url(download=download)
    if not is_http_url(url):
        raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

    download['download_time'] = sql_now()
    download['state'] = 'fetching'

    db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)

    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)

    return response
@classmethod
def _get_stories_from_univision_feed(cls, content: str, media_id: int) -> List[Dict[str, Any]]:
    """Parse the feed. Return a (non-db-backed) story dict for each story found in the feed."""
    content = decode_object_from_bytes_if_needed(content)
    if isinstance(media_id, bytes):
        media_id = decode_object_from_bytes_if_needed(media_id)

    media_id = int(media_id)

    if not content:
        raise McCrawlerFetcherSoftError("Feed content is empty or undefined.")

    try:
        feed_json = decode_json(content)
    except Exception as ex:
        raise McCrawlerFetcherSoftError(f"Unable to decode Univision feed JSON: {ex}")

    try:
        # Intentionally raise exception on KeyError:
        if feed_json['status'] != 'success':
            raise McCrawlerFetcherSoftError(f"Univision feed response is not 'success': {content}")
    except Exception as ex:
        raise McCrawlerFetcherSoftError(f"Unable to verify Univision feed status: {ex}")

    try:
        # Intentionally let the .get() chain raise if 'data' is missing or None:
        feed_items = feed_json.get('data', None).get('items', None)
    except Exception as ex:
        raise McCrawlerFetcherSoftError(f"Univision feed response does not have 'data'/'items' key: {ex}")

    stories = []

    for item in feed_items:

        url = item.get('url', None)
        if not url:
            # Some items in the feed don't have their URLs set
            log.warning(f"'url' for item is not set: {item}")
            continue

        # sic -- we take "uid" (without "g") and call it "guid" (with "g")
        guid = item.get('uid', None)
        if not guid:
            raise McCrawlerFetcherSoftError(f"Item does not have its 'uid' set: {item}")

        title = item.get('title', '(no title)')
        description = item.get('description', '')

        try:
            # Intentionally raise exception on KeyError:
            str_publish_date = item['publishDate']
            publish_timestamp = str2time_21st_century(str_publish_date)
            publish_date = get_sql_date_from_epoch(publish_timestamp)
        except Exception as ex:
            # Die for good because Univision's dates should be pretty predictable
            raise McCrawlerFetcherSoftError(f"Unable to parse item's {item} publish date: {ex}")

        log.debug(f"Story found in Univision feed: URL '{url}', title '{title}', publish date '{publish_date}'")

        stories.append({
            'url': url,
            'guid': guid,
            'media_id': media_id,
            'publish_date': publish_date,
            'title': title,
            'description': description,
        })

    return stories
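# Illustrative only: a minimal sketch of the JSON document that
# _get_stories_from_univision_feed() above expects. Field values are made up;
# the keys ('status', 'data', 'items', 'url', 'uid', 'title', 'description',
# 'publishDate') are the ones the parser reads.
_EXAMPLE_UNIVISION_FEED = {
    'status': 'success',
    'data': {
        'items': [
            {
                'url': 'https://www.univision.com/noticias/example-story',
                'uid': '00000000-0000-0000-0000-000000000000',
                'title': 'Example title',
                'description': 'Example description',
                'publishDate': '2024-01-01T00:00:00-05:00',
            },
        ],
    },
}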
def add_stories_from_feed(self, db: DatabaseHandler, download: dict, content: str) -> List[int]:
    """
    Parse the feed content; create a story dict for each parsed story; check for a new URL since the last feed
    download; if there is a new URL, check whether each story is new, and if so, add it to the database and add a
    pending download for it. Return new stories that were found in the feed.
    """
    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    media_id = get_media_id(db=db, download=download)
    download_time = download['download_time']

    try:
        stories = self._get_stories_from_syndicated_feed(
            content=content,
            media_id=media_id,
            download_time=download_time,
        )
    except Exception as ex:
        raise McCrawlerFetcherSoftError(f"Error processing feed for {download['url']}: {ex}")

    if stories_checksum_matches_feed(db=db, feeds_id=download['feeds_id'], stories=stories):
        return []

    new_story_ids = []
    for story in stories:

        # FIXME None of the helpers like keys they don't know about
        story_without_enclosures = story.copy()
        story_without_enclosures.pop('enclosures')

        if self._add_content_download_for_new_stories():
            added_story = add_story_and_content_download(
                db=db,
                story=story_without_enclosures,
                parent_download=download,
            )
        else:
            added_story = add_story(
                db=db,
                story=story_without_enclosures,
                feeds_id=download['feeds_id'],
            )

        # We might have received None due to a GUID conflict
        if added_story:

            stories_id = added_story['stories_id']
            story_is_new = added_story.get('is_new', False)

            if story_is_new:

                # Add all of the enclosures
                for enclosure in story['enclosures']:

                    # ...provided that the URL is set
                    if enclosure['url']:

                        db.query("""
                            INSERT INTO story_enclosures (stories_id, url, mime_type, length)
                            VALUES (%(stories_id)s, %(url)s, %(mime_type)s, %(length)s)

                            -- Some stories have multiple enclosures pointing to the same URL
                            ON CONFLICT (stories_id, url) DO NOTHING
                        """, {
                            'stories_id': stories_id,
                            'url': enclosure['url'],
                            'mime_type': enclosure['mime_type'],
                            'length': enclosure['length'],
                        })

                # Append to the list of newly added stories
                new_story_ids.append(stories_id)

    log.info(f"add_stories_from_feed: new stories: {len(new_story_ids)} / {len(stories)}")

    return new_story_ids