def test_get_sql_date_from_epoch():
    assert get_sql_date_from_epoch(int(time.time())) == datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
    assert get_sql_date_from_epoch(0) == datetime.datetime.fromtimestamp(0).strftime('%Y-%m-%d %H:%M:%S')
    # noinspection PyTypeChecker
    assert get_sql_date_from_epoch('badger') == datetime.datetime.fromtimestamp(0).strftime('%Y-%m-%d %H:%M:%S')
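# The assertions above pin down the contract of get_sql_date_from_epoch(): it formats
# an epoch timestamp as a PostgreSQL-style 'YYYY-MM-DD HH:MM:SS' string in the
# machine's local timezone, and falls back to the Unix epoch on unparseable input.
# A minimal sketch consistent with those assertions -- an assumption for illustration,
# not the actual implementation:
import datetime


def get_sql_date_from_epoch(epoch) -> str:
    """Format an epoch timestamp as 'YYYY-MM-DD HH:MM:SS' in the local timezone."""
    try:
        epoch = int(epoch)
    except (TypeError, ValueError):
        # Bad input (e.g. 'badger') falls back to the Unix epoch, per the test above.
        epoch = 0
    # fromtimestamp() renders in the machine's local timezone, matching the tests.
    return datetime.datetime.fromtimestamp(epoch).strftime('%Y-%m-%d %H:%M:%S')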
def _create_child_download_for_story(db: DatabaseHandler, story: dict, parent_download: dict) -> None:
    """Create a pending download for the story's URL."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    download = {
        'feeds_id': parent_download['feeds_id'],
        'stories_id': story['stories_id'],
        'parent': parent_download['downloads_id'],
        'url': story['url'],
        'host': get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'pending',
        'priority': parent_download['priority'],
        'extracted': False,
    }

    content_delay = db.query("""
        SELECT content_delay
        FROM media
        WHERE media_id = %(media_id)s
    """, {'media_id': story['media_id']}).flat()[0]
    if content_delay:
        # Delay download of content by this many hours. This is useful for sources that are likely to
        # significantly change content in the hours after it is first published.
        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
        download_at_timestamp = now + (content_delay * 60 * 60)
        download['download_time'] = get_sql_date_from_epoch(download_at_timestamp)

    db.create(table='downloads', insert_hash=download)
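# For illustration, suppose media.content_delay = 12 for a source (a hypothetical
# value, not taken from the original code). The scheduling arithmetic above then
# works out as:
#
#     now = 1_700_000_000                            # epoch seconds, UTC
#     download_at_timestamp = now + 12 * 60 * 60     # 1_700_043_200, i.e. 12 hours later
#     download['download_time'] = get_sql_date_from_epoch(1_700_043_200)
#
# When content_delay is NULL or 0 the branch is skipped and no 'download_time' key is
# set, so the insert presumably falls back to the downloads table's default and the
# story is fetched without delay.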
def publish_date_sql(self) -> Optional[str]:
    """Return item publication date as a PostgreSQL-formatted string in a local timezone."""
    postgresql_date = None

    published_tuple = self._parsed_publish_date()
    if published_tuple:
        # FIXME unfortunately, Perl's implementation would make the timezone vanish, so dates & times would get
        # stored in the machine's timezone in PostgreSQL (which is set to America/New_York in production). We
        # haven't added a timezone to the stories.publish_date column yet, so we have to keep the present buggy
        # behavior here.
        timestamp = int(calendar.timegm(published_tuple))
        postgresql_date = get_sql_date_from_epoch(timestamp)

    return postgresql_date
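# To make the FIXME above concrete: calendar.timegm() correctly treats the parsed
# struct_time as UTC and yields a true epoch timestamp, but get_sql_date_from_epoch()
# then renders that epoch in the machine's local timezone, so the tz-naive string
# stored in stories.publish_date is local time. A minimal standalone illustration
# (not part of the original module):
import calendar
import time

utc_tuple = time.strptime('2020-01-01 12:00:00', '%Y-%m-%d %H:%M:%S')
epoch = calendar.timegm(utc_tuple)  # 1577880000 -- noon UTC on 2020-01-01
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(epoch)))
# On a machine set to America/New_York this prints '2020-01-01 07:00:00': the value
# that ends up in the database, with the original timezone "vanished".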
def test_add_stale_feeds():
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')

    pending_feeds = []

    feed = {
        'media_id': medium['media_id'],
        'name': 'null last download',
        'url': 'http://null last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': None,
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'recent last download',
        'url': 'http://recent last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': sql_now(),
    }
    db.create('feeds', feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'recent last new story',
        'url': 'http://recent last new story',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': sql_now(),
        'last_new_story_time': sql_now(),
    }
    db.create('feeds', feed)

    feed = {
        'media_id': medium['media_id'],
        'name': '5 minute new story',
        'url': 'http://5 minute new story',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': get_sql_date_from_epoch(int(time.time()) - 300),
        'last_new_story_time': get_sql_date_from_epoch(int(time.time()) - 300),
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'old last download',
        'url': 'http://old last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': get_sql_date_from_epoch(int(time.time()) - (86400 * 10)),
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    _add_stale_feeds(db)

    num_pending_downloads = db.query("select count(*) from downloads where state = 'pending'").flat()[0]
    assert num_pending_downloads == len(pending_feeds)

    for feed in pending_feeds:
        exists = db.query(
            "select * from downloads where state = 'pending' and feeds_id = %(a)s",
            {'a': feed['feeds_id']},
        ).hash()
        assert exists, "download for feed %s added" % feed['name']
def _get_stories_from_univision_feed(cls, content: str, media_id: int) -> List[Dict[str, Any]]:
    """Parse the feed. Return a (non-db-backed) story dict for each story found in the feed."""
    content = decode_object_from_bytes_if_needed(content)
    if isinstance(media_id, bytes):
        media_id = decode_object_from_bytes_if_needed(media_id)
    media_id = int(media_id)

    if not content:
        raise McCrawlerFetcherSoftError("Feed content is empty or undefined.")

    try:
        feed_json = decode_json(content)
    except Exception as ex:
        raise McCrawlerFetcherSoftError(f"Unable to decode Univision feed JSON: {ex}")

    try:
        # Intentionally raise exception on KeyError:
        if not feed_json['status'] == 'success':
            raise McCrawlerFetcherSoftError(f"Univision feed response is not 'success': {content}")
    except Exception as ex:
        raise McCrawlerFetcherSoftError(f"Unable to verify Univision feed status: {ex}")

    try:
        # Intentionally raise exception on KeyError:
        feed_items = feed_json.get('data', None).get('items', None)
    except Exception as ex:
        raise McCrawlerFetcherSoftError(f"Univision feed response does not have 'data'/'items' key: {ex}")

    stories = []

    for item in feed_items:

        url = item.get('url', None)
        if not url:
            # Some items in the feed don't have their URLs set
            log.warning(f"'url' for item is not set: {item}")
            continue

        # sic -- we take "uid" (without "g") and call it "guid" (with "g")
        guid = item.get('uid', None)
        if not guid:
            raise McCrawlerFetcherSoftError(f"Item does not have its 'uid' set: {item}")

        title = item.get('title', '(no title)')
        description = item.get('description', '')

        try:
            # Intentionally raise exception on KeyError:
            str_publish_date = item['publishDate']
            publish_timestamp = str2time_21st_century(str_publish_date)
            publish_date = get_sql_date_from_epoch(publish_timestamp)
        except Exception as ex:
            # Die for good because Univision's dates should be pretty predictable
            raise McCrawlerFetcherSoftError(f"Unable to parse item's {item} publish date: {ex}")

        log.debug(f"Story found in Univision feed: URL '{url}', title '{title}', publish date '{publish_date}'")

        stories.append({
            'url': url,
            'guid': guid,
            'media_id': media_id,
            'publish_date': publish_date,
            'title': title,
            'description': description,
        })

    return stories
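# The parser above implies the shape of the Univision feed payload. A hypothetical
# minimal document that would pass every check (field names are taken from the code;
# the value formats are illustrative assumptions, not a real Univision response):
#
# {
#     "status": "success",
#     "data": {
#         "items": [
#             {
#                 "url": "https://www.univision.com/noticias/example-story",
#                 "uid": "00000000-0000-0000-0000-000000000000",
#                 "title": "Example story title",
#                 "description": "Example story description.",
#                 "publishDate": "2020-01-01 12:00:00"
#             }
#         ]
#     }
# }
#
# Items missing 'url' are skipped with a warning; a missing 'uid' or an unparseable
# 'publishDate' aborts the whole feed with McCrawlerFetcherSoftError.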
def test_add_stale_feeds(self) -> None:
    """Test _add_stale_feeds()."""
    db = self.db()

    medium = mediawords.test.db.create.create_test_medium(db, 'foo')

    pending_feeds = []

    feed = {
        'media_id': medium['media_id'],
        'name': 'null last download',
        'url': 'http://null last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': None,
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'recent last download',
        'url': 'http://recent last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': sql_now(),
    }
    feed = db.create('feeds', feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'recent last new story',
        'url': 'http://recent last new story',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': sql_now(),
        'last_new_story_time': sql_now(),
    }
    feed = db.create('feeds', feed)

    feed = {
        'media_id': medium['media_id'],
        'name': '5 minute new story',
        'url': 'http://5 minute new story',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': get_sql_date_from_epoch(int(time.time()) - 300),
        'last_new_story_time': get_sql_date_from_epoch(int(time.time()) - 300),
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'old last download',
        'url': 'http://old last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': get_sql_date_from_epoch(int(time.time()) - (86400 * 10)),
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    mediawords.crawler.provider._add_stale_feeds(db)

    num_pending_downloads = db.query("select count(*) from downloads where state = 'pending'").flat()[0]
    assert num_pending_downloads == len(pending_feeds)

    for feed in pending_feeds:
        exists = db.query(
            "select * from downloads where state = 'pending' and feeds_id = %(a)s",
            {'a': feed['feeds_id']},
        ).hash()
        assert exists, "download for feed %s added" % feed['name']