def add_stories_from_feed(self, db: DatabaseHandler, download: dict, content: str) -> List[int]:
    """
    Add the stories parsed from a Univision feed and report the new ones.

    The feed content is parsed with _get_stories_from_univision_feed(), and
    every parsed story is handed to add_story_and_content_download() with
    the feed download as its parent.

    :param db: Database handler passed through to the helpers.
    :param download: Feed download row; decoded from bytes if needed.
    :param content: Raw feed content; decoded from bytes if needed.
    :return: 'stories_id' of every added story that is flagged 'is_new'.
    """
    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    media_id = get_media_id(db=db, download=download)
    parsed_stories = self._get_stories_from_univision_feed(content=content, media_id=media_id)

    new_ids = []
    for parsed_story in parsed_stories:
        added = add_story_and_content_download(db=db, story=parsed_story, parent_download=download)

        # The helper may return a falsy value instead of a story; skip those.
        if not added:
            continue

        # Only stories flagged as new are reported back to the caller.
        if not added.get('is_new', None):
            continue

        new_ids.append(added['stories_id'])

    return new_ids
def add_stories_from_feed(self, db: DatabaseHandler, download: dict, content: str) -> List[int]:
    """
    Parse a syndicated feed, add its stories, and return the IDs of the new ones.

    Steps:

    1. Parse the feed content into story dicts with
       _get_stories_from_syndicated_feed().
    2. If stories_checksum_matches_feed() reports a match for this feed,
       return an empty list without adding anything.
    3. Add every story, either together with a content download or as a bare
       story, depending on _add_content_download_for_new_stories().
    4. For every story flagged as new, store its enclosures that have a URL
       in "story_enclosures".

    :param db: Database handler used for the helpers and the enclosure inserts.
    :param download: Feed download row; 'download_time', 'url' and 'feeds_id'
        are read from it. Decoded from bytes if needed.
    :param content: Raw feed content; decoded from bytes if needed.
    :return: 'stories_id' of every story that was newly added.
    :raises McCrawlerFetcherSoftError: if parsing the feed content fails.
    """
    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    media_id = get_media_id(db=db, download=download)
    download_time = download['download_time']

    try:
        stories = self._get_stories_from_syndicated_feed(
            content=content,
            media_id=media_id,
            download_time=download_time,
        )
    except Exception as ex:
        # Chain explicitly so the original parser error stays attached as the cause.
        raise McCrawlerFetcherSoftError(
            f"Error processing feed for {download['url']}: {ex}") from ex

    if stories_checksum_matches_feed(db=db, feeds_id=download['feeds_id'], stories=stories):
        return []

    new_story_ids = []
    for story in stories:

        # FIXME None of the helpers like keys they don't know about
        story_without_enclosures = story.copy()
        story_without_enclosures.pop('enclosures')

        if self._add_content_download_for_new_stories():
            added_story = add_story_and_content_download(
                db=db,
                story=story_without_enclosures,
                parent_download=download,
            )
        else:
            added_story = add_story(
                db=db,
                story=story_without_enclosures,
                feeds_id=download['feeds_id'],
            )

        # We might have received None due to a GUID conflict
        if not added_story:
            continue

        stories_id = added_story['stories_id']
        if not added_story.get('is_new', False):
            continue

        # Add all of the enclosures...
        for enclosure in story['enclosures']:

            # ...provided that the URL is set
            if not enclosure['url']:
                continue

            # Some stories have multiple enclosures pointing to the same URL,
            # hence ON CONFLICT ... DO NOTHING. This note is deliberately a
            # Python comment and not an SQL "--" comment: an SQL line comment
            # runs to the end of the line, so it would swallow the ON CONFLICT
            # clause if the query's line breaks were ever lost.
            db.query(
                """
                INSERT INTO story_enclosures (stories_id, url, mime_type, length)
                VALUES (%(stories_id)s, %(url)s, %(mime_type)s, %(length)s)
                ON CONFLICT (stories_id, url) DO NOTHING
                """,
                {
                    'stories_id': stories_id,
                    'url': enclosure['url'],
                    'mime_type': enclosure['mime_type'],
                    'length': enclosure['length'],
                })

        # Append to the list of newly added stories
        new_story_ids.append(stories_id)

    log.info(
        f"add_stories_from_feed: new stories: {len(new_story_ids)} / {len(stories)}"
    )

    return new_story_ids
def test_get_media_id(self):
    """get_media_id() on the test download returns the test medium's 'media_id'."""
    actual_media_id = get_media_id(db=self._db, download=self.test_download)
    expected_media_id = self.test_medium['media_id']
    assert actual_media_id == expected_media_id