def test_add_story_description_unset():
    """Test adding a story without a description being set."""
    db = connect_to_db()

    medium = create_test_medium(db=db, label='test')
    feed = create_test_feed(db=db, label='test', medium=medium)

    story = {
        'url': 'http://test',
        'guid': 'http://test',
        'media_id': medium['media_id'],
        'title': "test",

        # stories.description can be NULL so it's a valid value:
        'description': None,

        'publish_date': '2016-10-15 08:00:00',
        'collect_date': '2016-10-15 10:00:00',
    }

    add_story(db=db, story=story, feeds_id=feed['feeds_id'])

    assert len(db.select(table='stories', what_to_select='*').hashes()) == 1
    assert len(db.select(table='feeds_stories_map', what_to_select='*').hashes()) == 1
def test_add_story_full_text_rss(self):
    """Test add_story() with only parent media's full_text_rss set to True."""
    media_id = self.test_medium['media_id']
    feeds_id = self.test_feed['feeds_id']

    self.db().update_by_id(
        table='media',
        object_id=media_id,
        update_hash={'full_text_rss': True},
    )

    story = {
        'media_id': media_id,
        'url': 'http://add.story/',
        'guid': 'http://add.story/',
        'title': 'test add story',
        'description': 'test add story',
        'publish_date': '2016-10-15 08:00:00',
        'collect_date': '2016-10-15 10:00:00',
        # 'full_text_rss' to be inferred from parent "media" item
    }

    added_story = add_story(db=self.db(), story=story, feeds_id=feeds_id)

    assert added_story
    assert 'stories_id' in added_story
    assert story['url'] == added_story['url']
    assert added_story['full_text_rss'] is True
def test_add_story(self):
    """Test add_story()."""
    media_id = self.test_medium['media_id']
    feeds_id = self.test_feed['feeds_id']

    # Basic story
    story = {
        'media_id': media_id,
        'url': 'http://add.story/',
        'guid': 'http://add.story/',
        'title': 'test add story',
        'description': 'test add story',
        'publish_date': '2016-10-15 08:00:00',
        'collect_date': '2016-10-15 10:00:00',
        'full_text_rss': True,
    }

    added_story = add_story(db=self.db(), story=story, feeds_id=feeds_id)

    assert added_story
    assert 'stories_id' in added_story
    assert story['url'] == added_story['url']
    assert added_story['full_text_rss'] is True

    feeds_stories_tag_mapping = self.db().select(
        table='feeds_stories_map',
        what_to_select='*',
        condition_hash={
            'stories_id': added_story['stories_id'],
            'feeds_id': feeds_id,
        },
    ).hashes()
    assert len(feeds_stories_tag_mapping) == 1

    story_urls = self.db().query(
        "select * from story_urls where stories_id = %(a)s",
        {'a': added_story['stories_id']},
    ).hashes()
    assert len(story_urls) == 1
    assert story_urls[0]['url'] == added_story['url']

    # Try adding a duplicate story
    dup_story = add_story(db=self.db(), story=story, feeds_id=feeds_id)
    assert dup_story is not None
    assert dup_story['stories_id'] == added_story['stories_id']
def test_add_story(self):
    """Test add_story()."""
    media_id = self.test_medium['media_id']
    feeds_id = self.test_feed['feeds_id']

    # Basic story
    story = {
        'media_id': media_id,
        'url': 'http://add.story/',
        'guid': 'http://add.story/',
        'title': 'test add story',
        'description': 'test add story',
        'publish_date': '2016-10-15 08:00:00',
        'collect_date': '2016-10-15 10:00:00',
        'full_text_rss': True,
    }

    added_story = add_story(db=self.db(), story=story, feeds_id=feeds_id)

    assert added_story
    assert 'stories_id' in added_story
    assert story['url'] == added_story['url']
    assert added_story['full_text_rss'] is True

    feeds_stories_tag_mapping = self.db().select(
        table='feeds_stories_map',
        what_to_select='*',
        condition_hash={
            'stories_id': added_story['stories_id'],
            'feeds_id': feeds_id,
        },
    ).hashes()
    assert len(feeds_stories_tag_mapping) == 1

    # Try adding a duplicate story
    added_story = add_story(db=self.db(), story=story, feeds_id=feeds_id)
    assert added_story is None

    # Try adding a duplicate story with explicit "is new" testing disabled
    added_story = add_story(db=self.db(), story=story, feeds_id=feeds_id, skip_checking_if_new=True)
    assert added_story is None
def _import_ap_story(db: DatabaseHandler, ap_story: dict) -> None:
    """Given an AP story returned by get_new_stories(), add it to the database."""
    ap_medium = db.query("""
        SELECT *
        FROM media
        WHERE name = %(medium_name)s
    """, {
        'medium_name': AP_MEDIUM_NAME,
    }).hash()

    ap_feed = {
        'media_id': ap_medium['media_id'],
        'name': 'API Feed',
        'active': False,
        'type': 'syndicated',
        'url': 'http://ap.com'
    }
    ap_feed = db.find_or_create('feeds', ap_feed)

    story = {
        'guid': ap_story['guid'],
        'url': ap_story['url'],
        'publish_date': ap_story['publish_date'],
        'title': ap_story['title'],
        'description': ap_story['description'],
        'media_id': ap_medium['media_id']
    }
    story = add_story(db, story, ap_feed['feeds_id'])

    if not story:
        return

    story_download = create_download_for_new_story(db, story, ap_feed)

    download_text = {
        'downloads_id': story_download['downloads_id'],
        'download_text': ap_story['text'],
        'download_text_length': len(ap_story['text'])
    }

    db.query("""
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, %(download_text_length)s)
    """, download_text)

    # Send to the extractor for it to do vectorization, language detection, etc.
    JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(
        stories_id=story['stories_id'],
        use_existing=True,
    )
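# Illustrative sketch only (not part of the original module): a small driver that feeds
# each story dict returned by get_new_stories() into _import_ap_story(). The batching
# function name below is a hypothetical helper added for illustration.
def _import_ap_stories(db: DatabaseHandler, ap_stories: List[dict]) -> None:
    """Import every AP story dict in the given list, one at a time."""
    for ap_story in ap_stories:
        _import_ap_story(db=db, ap_story=ap_story)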
def add_story_and_content_download(db: DatabaseHandler, story: dict, parent_download: dict) -> Optional[dict]:
    """If the story is new, add it to the database and also add a pending download for the story content."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    story = add_story(db=db, story=story, feeds_id=parent_download['feeds_id'])

    if story:
        if story.get('is_new', False):
            _create_child_download_for_story(db=db, story=story, parent_download=parent_download)

    return story
def add_stories_from_feed(self, db: DatabaseHandler, download: dict, content: str) -> List[int]:
    """
    Handle feeds of type 'web_page' by just creating a story to associate with the content.

    Web page feeds are feeds that consist of a web page that we download once a week and add as a story.
    """
    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    feeds_id = download['feeds_id']

    feed = db.find_by_id(table='feeds', object_id=feeds_id)

    title = html_title(html=content, fallback='(no title)')
    title += '[' + sql_now() + ']'

    guid = f"{str(int(time.time()))}:{download['url']}"[0:1024]

    new_story = {
        'url': download['url'],
        'guid': guid,
        'media_id': feed['media_id'],
        'publish_date': sql_now(),
        'title': title,
    }

    story = add_story(db=db, story=new_story, feeds_id=feeds_id)
    if not story:
        raise McCrawlerFetcherSoftError(f"Failed to add story {new_story}")

    db.query("""
        UPDATE downloads
        SET stories_id = %(stories_id)s,
            type = 'content'
        WHERE downloads_id = %(downloads_id)s
    """, {
        'stories_id': story['stories_id'],
        'downloads_id': download['downloads_id'],
    })

    # A webpage that was just fetched is also a story
    story_ids = [
        story['stories_id'],
    ]

    return story_ids
def add_stories_from_feed(self, db: DatabaseHandler, download: dict, content: str) -> List[int]:
    """
    Parse the feed content; create a story dict for each parsed story; check for a new URL since the last feed
    download; if there is a new URL, check whether each story is new, and if so add it to the database and add a
    pending download for it.

    Return new stories that were found in the feed.
    """
    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    media_id = get_media_id(db=db, download=download)
    download_time = download['download_time']

    try:
        stories = self._get_stories_from_syndicated_feed(
            content=content,
            media_id=media_id,
            download_time=download_time,
        )
    except Exception as ex:
        raise McCrawlerFetcherSoftError(f"Error processing feed for {download['url']}: {ex}")

    if stories_checksum_matches_feed(db=db, feeds_id=download['feeds_id'], stories=stories):
        return []

    new_story_ids = []
    for story in stories:

        # FIXME None of the helpers like keys they don't know about
        story_without_enclosures = story.copy()
        story_without_enclosures.pop('enclosures')

        if self._add_content_download_for_new_stories():
            added_story = add_story_and_content_download(
                db=db,
                story=story_without_enclosures,
                parent_download=download,
            )
        else:
            added_story = add_story(
                db=db,
                story=story_without_enclosures,
                feeds_id=download['feeds_id'],
            )

        # We might have received None due to a GUID conflict
        if added_story:

            stories_id = added_story['stories_id']
            story_is_new = added_story.get('is_new', False)

            if story_is_new:

                # Add all of the enclosures
                for enclosure in story['enclosures']:

                    # ...provided that the URL is set
                    if enclosure['url']:
                        db.query("""
                            INSERT INTO story_enclosures (stories_id, url, mime_type, length)
                            VALUES (%(stories_id)s, %(url)s, %(mime_type)s, %(length)s)

                            -- Some stories have multiple enclosures pointing to the same URL
                            ON CONFLICT (stories_id, url) DO NOTHING
                        """, {
                            'stories_id': stories_id,
                            'url': enclosure['url'],
                            'mime_type': enclosure['mime_type'],
                            'length': enclosure['length'],
                        })

                # Append to the list of newly added stories
                new_story_ids.append(stories_id)

    log.info(f"add_stories_from_feed: new stories: {len(new_story_ids)} / {len(stories)}")

    return new_story_ids
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: str = None,
                   publish_date: str = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    If inserting the story results in a unique constraint error based on media_id and url, return the existing
    story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    fallback_date - fallback to this date if the date guesser fails to find a date

    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:MAX_URL_LENGTH]

    medium = guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = get_spidered_tag(db)

    if title is None:
        title = html_title(content, url, MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = None
    if publish_date is None:
        date_guess = guess_date(url, content)
        story['publish_date'] = date_guess.date if date_guess.found else fallback_date
        if story['publish_date'] is None:
            story['publish_date'] = datetime.datetime.now().isoformat()
    else:
        story['publish_date'] = publish_date

    story = add_story(db, story, feed['feeds_id'])

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """,
        {'a': story['stories_id'], 'b': spidered_tag['tags_id']})

    if publish_date is None:
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" % (story['title'], story['url'], story['publish_date'], story['stories_id']))

    if story.get('is_new', False):
        download = create_download_for_new_story(db, story, feed)
        store_content(db, download, content)
        _extract_story(story)

    return story
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: str = None,
                   publish_date: str = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    If inserting the story results in a unique constraint error based on media_id and url, return the existing
    story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    fallback_date - fallback to this date if the date guesser fails to find a date

    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    log.debug(f"Generating story from URL {url}...")

    url = url[0:MAX_URL_LENGTH]

    log.debug(f"Guessing medium for URL {url}...")
    medium = guess_medium(db, url)
    log.debug(f"Done guessing medium for URL {url}: {medium}")

    log.debug(f"Getting spider feed for medium {medium}...")
    feed = get_spider_feed(db, medium)
    log.debug(f"Done getting spider feed for medium {medium}: {feed}")

    log.debug(f"Getting spidered tag...")
    spidered_tag = get_spidered_tag(db)
    log.debug(f"Done getting spidered tag: {spidered_tag}")

    if title is None:
        log.debug(f"Parsing HTML title...")
        title = html_title(content, url, MAX_TITLE_LENGTH)
        log.debug(f"Done parsing HTML title: {title}")

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = None
    if publish_date is None:
        log.debug(f"Guessing date for URL {url}...")
        date_guess = guess_date(url, content)
        log.debug(f"Done guessing date for URL {url}: {date_guess}")

        story['publish_date'] = date_guess.date if date_guess.found else None
    else:
        story['publish_date'] = publish_date

    log.debug(f"Adding story {story}...")
    story = add_story(db, story, feed['feeds_id'])
    log.debug(f"Done adding story {story}")

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """,
        {'a': story['stories_id'], 'b': spidered_tag['tags_id']})

    if publish_date is None:
        log.debug(f"Assigning date guess tag...")
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" % (story['title'], story['url'], story['publish_date'], story['stories_id']))

    if story.get('is_new', False):
        log.debug("Story is new, creating download...")
        download = create_download_for_new_story(db, story, feed)

        log.debug("Storing story content...")
        store_and_verify_content(db, download, content)

        log.debug("Extracting story...")
        _extract_story(db, story)
        log.debug("Done extracting story")

    else:
        log.debug("Story is not new, skipping download storage and extraction")

    log.debug(f"Done generating story from URL {url}")

    return story
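# Illustrative usage sketch only (not part of the original module): calling generate_story()
# for a fetched page with a fallback date in case the date guesser finds nothing. The URL,
# HTML content, and fallback date below are made-up example values.
def _example_generate_story(db: DatabaseHandler) -> None:
    story = generate_story(
        db=db,
        url='http://example.com/some-article',
        content='<html><head><title>Some article</title></head><body>...</body></html>',
        fallback_date='2016-10-15 08:00:00',
    )
    log.debug(f"Generated story with stories_id {story['stories_id']}")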