コード例 #1
0
    def add_stories_from_feed(self, db: DatabaseHandler, download: dict, content: str) -> List[int]:
        """
        Add the stories found in a downloaded feed and return the IDs of the new ones.

        Both "download" and "content" are first passed through decode_object_from_bytes_if_needed(). Every story
        returned by _get_stories_from_univision_feed() is handed to add_story_and_content_download(); a story's
        "stories_id" ends up in the result only if that call returned something truthy with a truthy "is_new" value.

        :param db: Database handler passed on to the helper functions.
        :param download: Feed download for which the stories are being added.
        :param content: Content of the feed download.
        :return: List of "stories_id" values of the stories reported as new.
        """
        download = decode_object_from_bytes_if_needed(download)
        content = decode_object_from_bytes_if_needed(content)

        feed_media_id = get_media_id(db=db, download=download)
        parsed_stories = self._get_stories_from_univision_feed(content=content, media_id=feed_media_id)

        new_story_ids = []
        for parsed_story in parsed_stories:
            added_story = add_story_and_content_download(db=db, story=parsed_story, parent_download=download)

            # Nothing usable came back for this story, so there is nothing to report
            if not added_story:
                continue

            if added_story.get('is_new'):
                new_story_ids.append(added_story['stories_id'])

        return new_story_ids
コード例 #2
0
    def add_stories_from_feed(self, db: DatabaseHandler, download: dict,
                              content: str) -> List[int]:
        """
        Parse the feed content, add the parsed stories to the database and return the IDs of the new ones.

        If stories_checksum_matches_feed() reports a match for this feed's stories, nothing is added and an empty list
        is returned. Otherwise every parsed story is added (together with a content download if
        _add_content_download_for_new_stories() says so), and the enclosures of each newly added story are stored in
        "story_enclosures".

        :param db: Database handler used for adding stories and enclosures.
        :param download: Feed download; its "download_time", "feeds_id" and (for the error message) "url" are read.
        :param content: Content of the feed download to be parsed.
        :return: List of "stories_id" values of the stories that were newly added.
        :raises McCrawlerFetcherSoftError: If parsing the feed content fails; the original exception is chained as
            the cause.
        """
        download = decode_object_from_bytes_if_needed(download)
        content = decode_object_from_bytes_if_needed(content)

        media_id = get_media_id(db=db, download=download)
        download_time = download['download_time']

        try:
            stories = self._get_stories_from_syndicated_feed(
                content=content,
                media_id=media_id,
                download_time=download_time,
            )
        except Exception as ex:
            # Chain explicitly so that the parsing error is preserved as the direct cause of the soft error
            raise McCrawlerFetcherSoftError(
                f"Error processing feed for {download['url']}: {ex}") from ex

        if stories_checksum_matches_feed(db=db,
                                         feeds_id=download['feeds_id'],
                                         stories=stories):
            return []

        new_story_ids = []
        for story in stories:

            # FIXME None of the helpers like keys they don't know about
            story_without_enclosures = story.copy()
            enclosures = story_without_enclosures.pop('enclosures')

            if self._add_content_download_for_new_stories():
                added_story = add_story_and_content_download(
                    db=db,
                    story=story_without_enclosures,
                    parent_download=download,
                )
            else:
                added_story = add_story(
                    db=db,
                    story=story_without_enclosures,
                    feeds_id=download['feeds_id'],
                )

            # We might have received None due to a GUID conflict
            if not added_story:
                continue

            stories_id = added_story['stories_id']

            # Stories that are not flagged as new get neither enclosures nor a place in the result
            if not added_story.get('is_new', False):
                continue

            # Add all of the enclosures
            for enclosure in enclosures:
                # ...provided that the URL is set
                if enclosure['url']:

                    db.query(
                        """
                                INSERT INTO story_enclosures (stories_id, url, mime_type, length)
                                VALUES (%(stories_id)s, %(url)s, %(mime_type)s, %(length)s)
                                
                                -- Some stories have multiple enclosures pointing to the same URL
                                ON CONFLICT (stories_id, url) DO NOTHING
                            """, {
                            'stories_id': stories_id,
                            'url': enclosure['url'],
                            'mime_type': enclosure['mime_type'],
                            'length': enclosure['length'],
                        })

            # Append to the list of newly added stories
            new_story_ids.append(stories_id)

        log.info(
            f"add_stories_from_feed: new stories: {len(new_story_ids)} / {len(stories)}"
        )

        return new_story_ids
コード例 #3
0
 def test_get_media_id(self):
     """get_media_id() must return the "media_id" of the test medium for the test download."""
     actual_media_id = get_media_id(download=self.test_download, db=self._db)
     expected_media_id = self.test_medium['media_id']
     assert actual_media_id == expected_media_id