Example #1
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Optional[Response]:
        download = decode_object_from_bytes_if_needed(download)

        url = self._download_url(download=download)
        if not is_http_url(url):
            raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        try:
            db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)
        except McTupleAlreadyMovedError as ex:
            # Some attempts to set the download's row to "fetching" fail with:
            #
            #   "tuple to be locked was already moved to another partition due to concurrent update"
            #
            # If that happens, we assume that some other fetcher instance somehow got to the download first and do
            # nothing
            log.warning(f"Some other fetcher got to download {download['downloads_id']} first: {ex}")
            return None
        except Exception as ex:
            # Raise further on misc. errors
            raise

        ua = UserAgent()
        response = ua.get_follow_http_html_redirects(url)

        return response
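
A minimal caller sketch for the example above. The names `handler`, `db`, and `queued_download` are assumptions for illustration: `handler` would be an instance of the fetcher class, `db` a connected DatabaseHandler, and `queued_download` a row dict from the "downloads" table.

def fetch_one(handler, db, queued_download):
    # fetch_download() returns None when another fetcher instance won the race
    # for this row (the McTupleAlreadyMovedError case above); treat that as "skip".
    response = handler.fetch_download(db=db, download=queued_download)
    if response is None:
        return None
    # The caller would now hand `response` over to content storage / story extraction.
    return response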
Example #2
    @classmethod
    def _get_stories_from_syndicated_feed(cls,
                                          content: str,
                                          media_id: int,
                                          download_time: str) -> List[Dict[str, Any]]:
        """Parse the feed. Return a list of (non-database-backed) story dicts for each story found in the feed."""
        feed = parse_feed(content)
        if not feed:
            raise McCrawlerFetcherSoftError("Unable to parse feed.")

        stories = []

        for item in feed.items():

            url = item.link()
            if not url:
                log.warning(f"URL for feed item is empty, skipping")
                continue

            guid = item.guid_if_valid()
            if not guid:
                guid = url

            title = item.title()
            if not title:
                title = '(no title)'

            description = item.description()

            publish_date = item.publish_date_sql()
            if not publish_date:
                publish_date = download_time

            enclosures = []

            for enclosure in item.enclosures():
                enclosures.append({
                    'url': enclosure.url(),
                    'mime_type': enclosure.mime_type(),
                    'length': enclosure.length(),
                })

            story = {
                'url': url,
                'guid': guid,
                'media_id': media_id,
                'publish_date': publish_date,
                'title': title,
                'description': description,
                'enclosures': enclosures,
            }
            stories.append(story)

        return stories
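
A hedged illustration of the dict shape that each feed item gets mapped to; every value below is a placeholder, not output from a real feed.

example_story = {
    'url': 'https://www.example.com/story',
    'guid': 'https://www.example.com/story',  # falls back to the URL when the item has no valid GUID
    'media_id': 1,
    'publish_date': '2021-01-01 00:00:00',    # falls back to download_time when the item has no date
    'title': '(no title)',                    # fallback used when the item has no title
    'description': None,
    'enclosures': [
        {'url': 'https://www.example.com/audio.mp3', 'mime_type': 'audio/mpeg', 'length': 123456},
    ],
}

Example #3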
    def add_stories_from_feed(self, db: DatabaseHandler, download: dict,
                              content: str) -> List[int]:
        """
        Handle feeds of type 'web_page' by just creating a story to associate with the content.

        Web page feeds are feeds that consist of a web page that we download once a week and add as a story.
        """
        download = decode_object_from_bytes_if_needed(download)
        content = decode_object_from_bytes_if_needed(content)

        feeds_id = download['feeds_id']

        feed = db.find_by_id(table='feeds', object_id=feeds_id)

        title = html_title(html=content, fallback='(no title)')
        title += '[' + sql_now() + ']'

        guid = f"{str(int(time.time()))}:{download['url']}"[0:1024]

        new_story = {
            'url': download['url'],
            'guid': guid,
            'media_id': feed['media_id'],
            'publish_date': sql_now(),
            'title': title,
        }

        story = add_story(db=db, story=new_story, feeds_id=feeds_id)
        if not story:
            raise McCrawlerFetcherSoftError(f"Failed to add story {new_story}")

        db.query(
            """
            UPDATE downloads
            SET stories_id = %(stories_id)s,
                type = 'content'
            WHERE downloads_id = %(downloads_id)s
        """, {
                'stories_id': story['stories_id'],
                'downloads_id': download['downloads_id'],
            })

        # A webpage that was just fetched is also a story
        story_ids = [
            story['stories_id'],
        ]

        return story_ids
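
Because the same page is re-fetched on a schedule, both the title and the GUID embed the fetch time, so every snapshot becomes a distinct story. A hedged illustration with made-up values:

example_title = 'Example front page[2021-01-01 00:00:00]'       # html_title() result + '[' + sql_now() + ']'
example_guid = '1609459200:https://www.example.com/front-page'  # int(time.time()) + ':' + download URL, capped at 1,024 characters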
Example #4
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
        download = decode_object_from_bytes_if_needed(download)

        url = self._download_url(download=download)
        if not is_http_url(url):
            raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        db.update_by_id(table='downloads',
                        object_id=download['downloads_id'],
                        update_hash=download)

        ua = UserAgent()
        response = ua.get_follow_http_html_redirects(url)

        return response
Example #5
    @classmethod
    def _get_stories_from_univision_feed(cls, content: str, media_id: int) -> List[Dict[str, Any]]:
        """Parse the feed. Return a (non-db-backed) story dict for each story found in the feed."""
        content = decode_object_from_bytes_if_needed(content)
        if isinstance(media_id, bytes):
            media_id = decode_object_from_bytes_if_needed(media_id)

        media_id = int(media_id)

        if not content:
            raise McCrawlerFetcherSoftError("Feed content is empty or undefined.")

        try:
            feed_json = decode_json(content)
        except Exception as ex:
            raise McCrawlerFetcherSoftError(f"Unable to decode Univision feed JSON: {ex}")

        try:
            # Intentionally raise exception on KeyError:
            if feed_json['status'] != 'success':
                raise McCrawlerFetcherSoftError(f"Univision feed response is not 'success': {content}")
        except Exception as ex:
            raise McCrawlerFetcherSoftError(f"Unable to verify Univision feed status: {ex}")

        try:
            # Intentionally raise exception on KeyError:
            feed_items = feed_json['data']['items']
        except Exception as ex:
            raise McCrawlerFetcherSoftError(f"Univision feed response does not have 'data'/'items' key: {ex}")

        stories = []

        for item in feed_items:
            url = item.get('url', None)
            if not url:
                # Some items in the feed don't have their URLs set
                log.warning(f"'url' for item is not set: {item}")
                continue

            # sic -- we take "uid" (without "g") and call it "guid" (with "g")
            guid = item.get('uid', None)
            if not guid:
                raise McCrawlerFetcherSoftError(f"Item does not have its 'uid' set: {item}")

            title = item.get('title', '(no title)')
            description = item.get('description', '')

            try:
                # Intentionally raise exception on KeyError:
                str_publish_date = item['publishDate']
                publish_timestamp = str2time_21st_century(str_publish_date)
                publish_date = get_sql_date_from_epoch(publish_timestamp)
            except Exception as ex:
                # Die for good because Univision's dates should be pretty predictable
                raise McCrawlerFetcherSoftError(f"Unable to parse item's {item} publish date: {ex}")

            log.debug(f"Story found in Univision feed: URL '{url}', title '{title}', publish date '{publish_date}'")
            stories.append({
                'url': url,
                'guid': guid,
                'media_id': media_id,
                'publish_date': publish_date,
                'title': title,
                'description': description,
            })

        return stories
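
A hedged reconstruction of the JSON shape this parser expects, inferred only from the key accesses above; every value is invented.

expected_feed_json = {
    'status': 'success',
    'data': {
        'items': [
            {
                'url': 'https://www.univision.com/example-story',
                'uid': 'abc123',                             # stored as the story's "guid"
                'title': 'Example title',                    # defaults to '(no title)' when missing
                'description': 'Example description',        # defaults to '' when missing
                'publishDate': '2021-01-01T00:00:00-05:00',  # parsed with str2time_21st_century()
            },
        ],
    },
}

Example #6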
    def add_stories_from_feed(self, db: DatabaseHandler, download: dict,
                              content: str) -> List[int]:
        """
        Parse the feed content; create a story dict for each parsed story; skip the feed if its stories checksum
        matches the previous feed download; otherwise, check whether each story is new, and if so add it to the
        database and add a pending download for it. Return the IDs of new stories that were found in the feed.
        """
        download = decode_object_from_bytes_if_needed(download)
        content = decode_object_from_bytes_if_needed(content)

        media_id = get_media_id(db=db, download=download)
        download_time = download['download_time']

        try:
            stories = self._get_stories_from_syndicated_feed(
                content=content,
                media_id=media_id,
                download_time=download_time,
            )
        except Exception as ex:
            raise McCrawlerFetcherSoftError(
                f"Error processing feed for {download['url']}: {ex}")

        if stories_checksum_matches_feed(db=db,
                                         feeds_id=download['feeds_id'],
                                         stories=stories):
            return []

        new_story_ids = []
        for story in stories:

            # FIXME None of the helpers like keys they don't know about
            story_without_enclosures = story.copy()
            story_without_enclosures.pop('enclosures')

            if self._add_content_download_for_new_stories():
                added_story = add_story_and_content_download(
                    db=db,
                    story=story_without_enclosures,
                    parent_download=download,
                )
            else:
                added_story = add_story(
                    db=db,
                    story=story_without_enclosures,
                    feeds_id=download['feeds_id'],
                )

            # We might have received None due to a GUID conflict
            if added_story:

                stories_id = added_story['stories_id']
                story_is_new = added_story.get('is_new', False)

                if story_is_new:

                    # Add all of the enclosures
                    for enclosure in story['enclosures']:
                        # ...provided that the URL is set
                        if enclosure['url']:

                            db.query(
                                """
                                INSERT INTO story_enclosures (stories_id, url, mime_type, length)
                                VALUES (%(stories_id)s, %(url)s, %(mime_type)s, %(length)s)
                                
                                -- Some stories have multiple enclosures pointing to the same URL
                                ON CONFLICT (stories_id, url) DO NOTHING
                            """, {
                                    'stories_id': stories_id,
                                    'url': enclosure['url'],
                                    'mime_type': enclosure['mime_type'],
                                    'length': enclosure['length'],
                                })

                    # Append to the list of newly added stories
                    new_story_ids.append(stories_id)

        log.info(
            f"add_stories_from_feed: new stories: {len(new_story_ids)} / {len(stories)}"
        )

        return new_story_ids
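
A hedged caller sketch; `handler`, `db`, `feed_download`, and `raw_feed_body` are assumed names rather than part of the example.

def process_feed_download(handler, db, feed_download, raw_feed_body):
    # Only genuinely new stories are returned; an unchanged feed (matching stories
    # checksum) or a feed of already-known GUIDs yields an empty list.
    new_story_ids = handler.add_stories_from_feed(db=db, download=feed_download, content=raw_feed_body)
    if not new_story_ids:
        log.info("Nothing new in this feed fetch.")
    return new_story_ids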