def store_transcript(cls, db: DatabaseHandler, transcript: Transcript) -> int:
    """Save a transcript's text as a raw download for its story.

    Returns the ID of the newly created download row.
    """
    story = db.find_by_id(table='stories', object_id=transcript.stories_id)

    # Resolve the feed the story belongs to so the download can be attached to it.
    feed = db.query(
        """
        SELECT *
        FROM feeds
        WHERE feeds_id = (
            SELECT feeds_id
            FROM feeds_stories_map
            WHERE stories_id = %(stories_id)s
        )
        """, {
            'stories_id': transcript.stories_id,
        }).hash()

    new_download = create_download_for_new_story(db=db, story=story, feed=feed)

    transcript_text = cls._download_text_from_transcript(transcript=transcript)

    # Store as a raw download and then let "extract-and-vector" app "extract" the stored text later
    store_content(db=db, download=new_download, content=transcript_text)

    return new_download['downloads_id']
def test_skip_self_links(self):
    """Test that self links are skipped within extract_links_for_topic_story"""
    story_domain = get_url_distinctive_domain(self.test_story['url'])

    topic = create_test_topic(self.db, 'links')
    self.db.create('topic_stories', {
        'topics_id': topic['topics_id'],
        'stories_id': self.test_story['stories_id'],
    })

    # Generate twice as many same-domain links as the cap allows
    num_links = MAX_SELF_LINKS * 2
    filler_text = "Sample sentence to make sure the links get extracted" * 10
    paragraphs = []
    for link_index in range(num_links):
        link_url = "http://%s/%d" % (story_domain, link_index)
        paragraphs.append("<p>%s <a href='%s'>link</a></p>\n\n" % (filler_text, link_url))
    content = ''.join(paragraphs)

    store_content(self.db, self.test_download, content)

    extract_links_for_topic_story(db=self.db,
                                  stories_id=self.test_story['stories_id'],
                                  topics_id=topic['topics_id'])

    topic_links = self.db.query(
        "select * from topic_links where topics_id = %(a)s", {
            'a': topic['topics_id']
        }).hashes()

    # Only MAX_SELF_LINKS of the self links should have been kept
    assert len(topic_links) == MAX_SELF_LINKS
async def fetch_store_transcript(self, stories_id: int) -> None:
    """Fetch a story's transcript JSON from GCS and store its text as a raw download.

    Downloads the object named after the story ID from the bucket configured in
    self.config.transcripts() into a temporary directory, decodes it into a
    Transcript, and stores the transcript text as a new 'content' download row
    (created idempotently via find_or_create()).
    """
    log.info(f"Fetching and storing transcript for story {stories_id}...")

    with tempfile.TemporaryDirectory(prefix='fetch_store_transcript') as temp_dir:
        transcript_json_path = os.path.join(temp_dir, 'transcript.json')

        # GCS object key is the story ID itself
        gcs = GCSStore(bucket_config=self.config.transcripts())
        gcs.download_object(object_id=str(stories_id), local_file_path=transcript_json_path)

        with open(transcript_json_path, 'r') as f:
            transcript_json = f.read()

        transcript = Transcript.from_dict(decode_json(transcript_json))

        db = connect_to_db_or_raise()

        story = db.find_by_id(table='stories', object_id=stories_id)

        feed = db.query(
            """
            SELECT *
            FROM feeds
            WHERE feeds_id = (
                SELECT feeds_id
                FROM feeds_stories_map
                WHERE stories_id = %(stories_id)s
            )
            """, {
                'stories_id': stories_id,
            }).hash()

        # Just like create_download_for_new_story(), it creates a new download except that it tests if such a download
        # exists first
        download = db.find_or_create(
            table='downloads',
            insert_hash={
                'feeds_id': feed['feeds_id'],
                'stories_id': story['stories_id'],
                'url': story['url'],
                'host': get_url_host(story['url']),
                'type': 'content',
                'sequence': 1,
                'state': 'success',
                'path': 'content:pending',
                'priority': 1,
                'extracted': 'f'
            },
        )

        text = transcript.download_text_from_transcript()

        # Store as a raw download and then let "extract-and-vector" app "extract" the stored text later
        store_content(db=db, download=download, content=text)

    log.info(f"Done fetching and storing transcript for story {stories_id}")
def test_get_extracted_html(self) -> None:
    """_get_extracted_html() should drop <head> and return a readability-wrapped body."""
    page = '<html><head><meta foo="bar" /></head><body>foo</body></html>'
    store_content(self.db, self.test_download, page)

    extracted_html = _get_extracted_html(self.db, self.test_story)

    assert extracted_html.strip() == '<body id="readabilityBody">foo</body>'
def setUp(self) -> None:
    """Set config for tests."""
    super().setUp()

    db = connect_to_db()
    self.db = db

    # Build the medium -> feed -> story -> download chain the tests operate on
    self.test_medium = create_test_medium(db, 'downloads test')
    self.test_feed = create_test_feed(db, 'downloads test', self.test_medium)
    self.test_download_feed = create_download_for_feed(db, self.test_feed)
    self.test_story = create_test_story(db, label='downloads est', feed=self.test_feed)
    self.test_download = create_download_for_story(db, feed=self.test_feed, story=self.test_story)

    store_content(db=db, download=self.test_download, content=self.__TEST_CONTENT)
def test_store_content(self) -> None:
    """Test store_content by calling store_content and then calling fetch_content() on the postgresql store."""
    amazon_s3_downloads_config = _default_amazon_s3_downloads_config()

    class PostgreSQLOnlyStorageConfig(DownloadStorageConfig):
        """Storage config that never reads from, nor falls back to, S3."""

        @staticmethod
        def read_all_from_s3():
            return False

        @staticmethod
        def fallback_postgresql_to_s3():
            return False

        @staticmethod
        def storage_locations():
            return ['postgresql']

    store = _get_store_for_reading(
        download=self.test_download,
        amazon_s3_downloads_config=amazon_s3_downloads_config,
        download_storage_config=PostgreSQLOnlyStorageConfig(),
    )

    # Store + fetch round trip for a download in the default state
    content = 'bat baz bar foo'
    got_download = store_content(db=self._db, download=self.test_download, content=content)
    got_content = store.fetch_content(db=self._db, object_id=self.test_download['downloads_id']).decode()
    assert got_content == content
    assert got_download['state'] == 'success'
    assert got_download['path'] == 'postgresql:raw_downloads'
    assert got_download['error_message'] == ''

    # A 'feed_error' download should keep its state after storing content
    content = 'bat baz bar'
    self.test_download['state'] = 'feed_error'
    got_download = store_content(db=self._db, download=self.test_download, content=content)
    got_content = store.fetch_content(db=self._db, object_id=self.test_download['downloads_id']).decode()
    assert got_content == content
    assert got_download['state'] == 'feed_error'
    assert got_download['path'] == 'postgresql:raw_downloads'
    assert not got_download['error_message']  # NULL or an empty string
def test_get_youtube_embed_links(self) -> None:
    """_get_youtube_embed_links() should return only YouTube iframe src URLs."""
    youtube_html = """
    <iframe src="http://youtube.com/embed/1234" />
    <img src="http://foo.com/foo.png" />
    <iframe src="http://youtube-embed.com/embed/3456" />
    <iframe src="http://bar.com" />
    """

    store_content(self.db, self.test_download, youtube_html)

    links = _get_youtube_embed_links(self.db, self.test_story)

    # Note: the youtube-embed.com URL comes back normalized to youtube.com
    assert links == [
        'http://youtube.com/embed/1234',
        'http://youtube.com/embed/3456',
    ]
def setUp(self):
    """Create test_story and test_download."""
    super().setUp()

    self.db = connect_to_db()

    media = create_test_story_stack(self.db, {'A': {'B': [1]}})
    feed = media['A']['feeds']['B']
    story = feed['stories']['1']

    download = create_download_for_story(db=self.db, feed=feed, story=story)
    store_content(self.db, download, '<p>foo</p>')

    self.test_story = story
    self.test_download = download
def store_and_verify_content(db: DatabaseHandler, download: dict, content: str) -> None:
    """Call store content and then poll verifying that the content has been stored.

    Only return once we have verified that the content has been stored. Raise an error after a timeout
    (STORE_CONTENT_TIMEOUT retries, one second apart) if the content is not found.

    It seems like S3 content is not available for fetching until a small delay after writing it. This function
    makes sure the content is there once the store operation is done.
    """
    store_content(db, download, content)

    tries = 0
    while True:
        try:
            fetch_content(db, download)
            break
        except Exception as e:
            # Give up only once the retry budget is exhausted; re-raise the last fetch error
            if tries > STORE_CONTENT_TIMEOUT:
                raise e

            # BUGFIX: log message previously said "story_and_verify_content" (typo for this function's name)
            log.debug(f"store_and_verify_content: waiting to retry verification ({tries}) ...")
            tries += 1
            time.sleep(1)
def test_extract(self) -> None:
    """Test extract()."""
    html = '<script>ignore</script><p>foo</p>'

    # Plain extraction strips the <script> and keeps the paragraph
    store_content(self.db, self.test_download, html)
    result = extract(db=self.db, download=self.test_download)
    assert result['extracted_html'].strip() == '<body id="readabilityBody"><p>foo</p></body>'
    assert result['extracted_text'].strip() == 'foo.'

    # Warm the extractor cache...
    store_content(self.db, self.test_download, html)
    extract(
        db=self.db,
        download=self.test_download,
        extractor_args=PyExtractorArguments(use_cache=True),
    )

    # ...then overwrite the stored content; cached extraction should still win
    store_content(self.db, self.test_download, 'bar')
    result = extract(
        db=self.db,
        download=self.test_download,
        extractor_args=PyExtractorArguments(use_cache=True),
    )
    assert result['extracted_html'].strip() == '<body id="readabilityBody"><p>foo</p></body>'
    assert result['extracted_text'].strip() == 'foo.'
def test_get_links_from_story(self):
    """Test get_links_from_story()."""
    # Links should be collected from the story title + description...
    self.test_story['title'] = 'http://title.text'
    self.test_story['description'] = '<a href="http://description.link" />http://description.text'
    self.db.update_by_id('stories', self.test_story['stories_id'], self.test_story)

    # ...the raw HTML content...
    html_content = """
    <p>Here is a content <a href="http://content.1.link">link</a>.</p>
    <p>Here is another content <a href="http://content.2.link" />link</a>.</p>
    <p>Here is a duplicate content <a href="http://content.2.link" />link</a>.</p>
    <p>Here is a duplicate text <a href="http://link-text.dup" />link</a>.</p>
    <p>Here is a youtube embed:</p>
    <iframe src="http://youtube-embed.com/embed/123456" />
    """

    # ...and the extracted download text; duplicates should be collapsed.
    text = "http://text.1.link http://text.2.link http://text.2.link http://link-text.dup"
    self.db.create('download_texts', {
        'downloads_id': self.test_download['downloads_id'],
        'download_text': text,
        'download_text_length': len(text),
    })

    expected_links = """
    http://content.1.link
    http://content.2.link
    http://youtube.com/embed/123456
    http://title.text
    http://description.link
    http://description.text
    http://text.1.link
    http://text.2.link
    http://link-text.dup
    """.split()

    store_content(self.db, self.test_download, html_content)

    links = _get_links_from_story(self.db, self.test_story)

    assert sorted(links) == sorted(expected_links)
def store_download(self, db: DatabaseHandler, download: dict, content: str) -> List[int]:
    """Store a single content download and return the ID of the story to extract."""
    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    downloads_id = download['downloads_id']
    stories_id = download['stories_id']

    # Validate inputs before touching the download store
    if not downloads_id:
        raise McCrawlerFetcherHardError("'downloads_id' is empty.")
    if not stories_id:
        raise McCrawlerFetcherHardError("'stories_id' is empty.")
    if content is None:  # Content might be empty but not None
        raise McCrawlerFetcherHardError(f"Content for download {downloads_id}, story {stories_id} is None.")

    log.info(f"Processing content download {downloads_id} (story {stories_id})...")

    if not content:
        log.warning(f"Content for download {downloads_id}, story {stories_id} is empty.")

    download = store_content(db=db, download=download, content=content)

    log.info(f"Done processing content download {downloads_id} (story {stories_id})")

    return [download['stories_id']]
def test_merge_foreign_rss_stories():
    """Test merge_foreign_rss_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'foo')

    # Plain (non-foreign-RSS) stories that must survive the merge untouched
    medium = create_test_medium(db, 'norss')
    feed = create_test_feed(db=db, label='norss', medium=medium)
    num_stories = 10
    stories = [create_test_story(db=db, label=str(i), feed=feed) for i in range(num_stories)]

    # Medium flagged with foreign RSS links; its stories should be merged away
    rss_medium = create_test_medium(db, 'rss')
    rss_medium = db.query(
        """
        UPDATE media SET
            foreign_rss_links = 't'
        WHERE media_id = %(media_id)s
        RETURNING *
        """, {
            'media_id': rss_medium['media_id'],
        }).hash()
    rss_feed = create_test_feed(db=db, label='rss', medium=rss_medium)

    num_rss_stories = 10
    rss_stories = []
    for i in range(num_rss_stories):
        rss_story = create_test_story(db=db, label=str(i), feed=rss_feed)
        rss_download = db.create('downloads', {
            'stories_id': rss_story['stories_id'],
            'feeds_id': rss_feed['feeds_id'],
            'url': rss_story['url'],
            'host': 'foo',
            'type': 'content',
            'state': 'success',
            'priority': 0,
            'sequence': 0,
            'path': 'postgresql',
        })
        store_content(db, rss_download, rss_story['title'])
        rss_stories.append(rss_story)

    # noinspection SqlInsertValues
    db.query(
        """
        INSERT INTO topic_stories (stories_id, topics_id)
            SELECT stories_id, %(topics_id)s AS topics_id
            FROM stories
        """, {
            'topics_id': int(topic['topics_id']),
        })

    assert db.query("SELECT COUNT(*) FROM topic_stories").flat()[0] == num_stories + num_rss_stories

    merge_foreign_rss_stories(db, topic)

    # Foreign RSS stories should have been moved out of topic_stories into seed URLs
    assert db.query("SELECT COUNT(*) FROM topic_stories").flat()[0] == num_stories
    assert db.query("SELECT COUNT(*) FROM topic_seed_urls").flat()[0] == num_rss_stories

    got_topic_stories_ids = db.query("SELECT stories_id FROM topic_stories").flat()
    expected_topic_stories_ids = [s['stories_id'] for s in stories]
    assert sorted(got_topic_stories_ids) == sorted(expected_topic_stories_ids)

    got_seed_urls = db.query(
        """
        SELECT topics_id, url, content
        FROM topic_seed_urls
        WHERE topics_id = %(topics_id)s
        """, {
            'topics_id': topic['topics_id'],
        }).hashes()
    expected_seed_urls = [
        {'url': s['url'], 'topics_id': topic['topics_id'], 'content': s['title']}
        for s in rss_stories
    ]
    assert sorted(got_seed_urls, key=itemgetter('url')) == sorted(expected_seed_urls, key=itemgetter('url'))
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack. Stores the content in the download
    store. Uses the story->{ content } field if present or otherwise generates the content using _get_test_content().

    Also creates the matching download_texts and story_sentences rows and marks the story as processed, so the story
    looks fully fetched + extracted to downstream code. Returns the enriched story dict (adds 'download', 'content'
    and 'download_text' keys). Raises McAddContentToTestStoryException if the download_texts row can't be read back.
    """
    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    content_language_code = None
    if 'content' in story:
        content = story['content']
        content_language_code = language_code_for_text(content)
    else:
        content = _get_test_content()

    # If language code was undetermined, or if we're using Latin test content
    if not content_language_code:
        content_language_code = 'en'

    # Content is stored as a download, so the story is no longer "full text RSS"
    if story.get('full_text_rss', None):
        story['full_text_rss'] = False

    db.update_by_id(
        table='stories',
        object_id=story['stories_id'],
        update_hash={
            'full_text_rss': False,
            'language': content_language_code,
        },
    )

    host = get_url_host(feed['url'])

    # Download row is created directly (not via a helper) so it can start in 'fetching'
    # state with 'extracted' already set
    download = db.create(table='downloads', insert_hash={
        'feeds_id': feed['feeds_id'],
        'url': story['url'],
        'host': host,
        'type': 'content',
        'sequence': 1,
        'state': 'fetching',
        'priority': 1,
        'extracted': True,
        'stories_id': story['stories_id'],
    })

    download = store_content(db=db, download=download, content=content)

    extracted_content = html_strip(content)

    story['download'] = download
    story['content'] = extracted_content

    db.query(
        """
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
        """, {
            'downloads_id': download['downloads_id'],
            'download_text': extracted_content,
        })

    lang = LanguageFactory.language_for_code(content_language_code)
    assert lang, f"Language is None for code {content_language_code}"

    # Split the extracted text into sentences, detecting a per-sentence language
    # (falling back to 'en' when detection fails)
    sentences = lang.split_text_to_sentences(extracted_content)
    sentence_number = 1
    for sentence in sentences:
        db.insert(table='story_sentences', insert_hash={
            'sentence': sentence,
            'language': language_code_for_text(sentence) or 'en',
            'sentence_number': sentence_number,
            'stories_id': story['stories_id'],
            'media_id': story['media_id'],
            'publish_date': story['publish_date'],
        })
        sentence_number += 1

    mark_as_processed(db=db, stories_id=story['stories_id'])

    # Read the download_texts row back so the caller gets the stored values
    story['download_text'] = db.query(
        """
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
        """, {
            'downloads_id': download['downloads_id']
        }).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download,
    extracted text, and so on. Return the new story.
    """
    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }
    story = db.create('stories', story)
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    # Copy the old story's tags to the new story
    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s
        """, {
            'a': story['stories_id'],
            'b': old_story['stories_id']
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id']
    })

    # Earliest download of the old story, if any
    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1", {
            'a': old_story['stories_id']
        }).hash()

    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            # On fetch/store failure, carry over the old download's state/error instead
            download_update = dict([(f, old_download[f]) for f in ['state', 'error_message', 'download_time']])
            db.update_by_id('downloads', download['downloads_id'], download_update)

        # NOTE(review): both the inserted downloads_id and the WHERE clause use the NEW
        # download's ID, so this copies nothing unless a download_texts row already exists
        # for the new download — confirm whether the WHERE was meant to reference
        # old_download['downloads_id'].
        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(a)s
            """,
            {'a': download['downloads_id']})

    # Copy the old story's sentences over to the new story
    # noinspection SqlInsertValues
    db.query(
        f"""
        insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)
            select {int(story['stories_id'])} as stories_id, sentence_number, sentence, media_id, publish_date, language
                from story_sentences
                where stories_id = %(b)s
        """,
        {'b': old_story['stories_id']})

    return story
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: str = None,
                   publish_date: str = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    If inserting the story results in a unique constraint error based on media_id and url, return
    the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    title - use this title instead of guessing it from the content
    publish_date - use this publish date instead of guessing it from the url/content
    fallback_date - fallback to this date if the date guesser fails to find a date
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:MAX_URL_LENGTH]

    medium = guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = get_spidered_tag(db)

    if title is None:
        title = html_title(content, url, MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    # Guess a date only when the caller didn't supply one; fall back to fallback_date,
    # then to "now" as a last resort
    date_guess = None
    if publish_date is None:
        date_guess = guess_date(url, content)
        story['publish_date'] = date_guess.date if date_guess.found else fallback_date
        if story['publish_date'] is None:
            story['publish_date'] = datetime.datetime.now().isoformat()
    else:
        story['publish_date'] = publish_date

    story = add_story(db, story, feed['feeds_id'])

    # Tag the story as spidered (idempotently)
    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
                where not exists (
                    select 1
                    from stories_tags_map
                    where stories_id = %(a)s
                      and tags_id = %(b)s
                )
        """, {
            'a': story['stories_id'],
            'b': spidered_tag['tags_id']
        })

    if publish_date is None:
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" % (story['title'], story['url'], story['publish_date'], story['stories_id']))

    # add_story() returns an existing story (without 'is_new') when the insert hit the
    # media_id/url unique constraint; only create a download + extract for new stories
    if story.get('is_new', False):
        download = create_download_for_new_story(db, story, feed)
        store_content(db, download, content)
        _extract_story(story)

    return story
def store_download(self, db: DatabaseHandler, download: dict, content: str) -> List[int]:
    """Store a feed download, add any stories found in it, and return story IDs to extract.

    On feed-parse failure the download is marked 'feed_error'; soft errors are swallowed
    (returning no stories to extract) while hard/unknown errors are re-raised. On success,
    the owning feed's last_successful_download_time (and last_new_story_time, when new
    stories were added) is updated, and the feed content itself is stored — replaced by
    the placeholder '(redundant feed)' when no new stories were found, to save storage.
    """
    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    downloads_id = download['downloads_id']

    log.info(f"Processing feed download {downloads_id}...")

    try:
        # BUGFIX: add_stories_from_feed() used to be called twice with identical arguments
        # (the second result assigned to story_ids_to_extract), duplicating its side
        # effects; call it once and reuse the result.
        # NOTE(review): confirm the second call wasn't meant to be a different method
        # (e.g. one that only *returns* the stories to be extracted).
        added_story_ids = self.add_stories_from_feed(db=db, download=download, content=content)
        story_ids_to_extract = added_story_ids

    except Exception as ex:
        error_message = f"Error processing feed for download {downloads_id}: {ex}"
        log.error(error_message)

        db.query(
            """
            UPDATE downloads
            SET state = 'feed_error',
                error_message = %(error_message)s
            WHERE downloads_id = %(downloads_id)s
            """, {
                'error_message': error_message,
                'downloads_id': downloads_id,
            })

        # On non-soft errors (explicitly hard errors or unknown errors), pass the exception up
        if not isinstance(ex, McCrawlerFetcherSoftError):
            raise ex

        story_ids_to_extract = []

    else:
        # Bump last_new_story_time only when the feed actually produced new stories
        if len(added_story_ids):
            last_new_story_time_sql = 'last_new_story_time = last_attempted_download_time, '
        else:
            last_new_story_time_sql = ''

        db.query(
            f"""
            UPDATE feeds
            SET {last_new_story_time_sql}
                last_successful_download_time = GREATEST(last_successful_download_time, %(download_time)s)
            WHERE feeds_id = %(feeds_id)s
            """, {
                'download_time': download['download_time'],
                'feeds_id': download['feeds_id'],
            })

        # If no new stories, just store "(redundant feed)" to save storage space
        if len(added_story_ids) == 0:
            content = '(redundant feed)'

    # Reread the possibly updated download
    download = db.find_by_id(table='downloads', object_id=downloads_id)

    # Store the feed in any case
    store_content(db=db, download=download, content=content)

    log.info(f"Done processing feed download {downloads_id}")

    return story_ids_to_extract
def test_merge_foreign_rss_stories():
    """Test merge_foreign_rss_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'foo')

    # Ordinary stories that should remain in topic_stories after the merge
    medium = create_test_medium(db, 'norss')
    feed = create_test_feed(db=db, label='norss', medium=medium)
    num_stories = 10
    stories = [create_test_story(db=db, label=str(i), feed=feed) for i in range(num_stories)]

    # Stories on a foreign-RSS-links medium should be merged into seed URLs
    rss_medium = create_test_medium(db, 'rss')
    rss_medium = db.query(
        "update media set foreign_rss_links = 't' where media_id = %(a)s returning *",
        {'a': rss_medium['media_id']}).hash()
    rss_feed = create_test_feed(db=db, label='rss', medium=rss_medium)

    num_rss_stories = 10
    rss_stories = []
    for i in range(num_rss_stories):
        rss_story = create_test_story(db=db, label=str(i), feed=rss_feed)
        rss_download = db.create('downloads', {
            'stories_id': rss_story['stories_id'],
            'feeds_id': rss_feed['feeds_id'],
            'url': rss_story['url'],
            'host': 'foo',
            'type': 'content',
            'state': 'success',
            'priority': 0,
            'sequence': 0,
            'path': 'postgresql',
        })
        store_content(db, rss_download, rss_story['title'])
        rss_stories.append(rss_story)

    # noinspection SqlInsertValues
    db.query(
        f"""
        insert into topic_stories (stories_id, topics_id)
            select s.stories_id, {int(topic['topics_id'])} from stories s
        """
    )

    assert db.query("select count(*) from topic_stories").flat()[0] == num_stories + num_rss_stories

    merge_foreign_rss_stories(db, topic)

    # Only the non-RSS stories should remain; RSS stories turn into seed URLs
    assert db.query("select count(*) from topic_stories").flat()[0] == num_stories
    assert db.query("select count(*) from topic_seed_urls").flat()[0] == num_rss_stories

    got_topic_stories_ids = db.query("select stories_id from topic_stories").flat()
    expected_topic_stories_ids = [s['stories_id'] for s in stories]
    assert sorted(got_topic_stories_ids) == sorted(expected_topic_stories_ids)

    got_seed_urls = db.query(
        "select topics_id, url, content from topic_seed_urls where topics_id = %(a)s",
        {'a': topic['topics_id']}).hashes()
    expected_seed_urls = [
        {'url': s['url'], 'topics_id': topic['topics_id'], 'content': s['title']}
        for s in rss_stories
    ]
    assert sorted(got_seed_urls, key=itemgetter('url')) == sorted(expected_seed_urls, key=itemgetter('url'))
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """
    story = db.create(
        'stories',
        {
            'url': old_story['url'],
            'media_id': new_medium['media_id'],
            'guid': old_story['guid'],
            'publish_date': old_story['publish_date'],
            'collect_date': sql_now(),
            'description': old_story['description'],
            'title': old_story['title']
        },
    )
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    # Copy the old story's tags one at a time, skipping any that already exist
    for old_story_tag in db.query(
        """
        SELECT tags_id
        FROM stories_tags_map
        WHERE stories_id = %(stories_id)s
        ORDER BY tags_id
        """,
        {'stories_id': old_story['stories_id']},
    ).hashes():
        stories_id = story['stories_id']
        tags_id = old_story_tag['tags_id']

        db.query("""
            INSERT INTO stories_tags_map (stories_id, tags_id)
            VALUES (%(stories_id)s, %(tags_id)s)
            ON CONFLICT (stories_id, tags_id) DO NOTHING
        """, {
            'stories_id': stories_id,
            'tags_id': tags_id,
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    # Earliest download of the old story, if any
    old_download = db.query(
        """
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
        ORDER BY downloads_id
        LIMIT 1
        """, {
            'stories_id': old_story['stories_id'],
        }
    ).hash()

    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            # On fetch/store failure, carry over the old download's state/error instead
            download_update = dict([(f, old_download[f]) for f in ['state', 'error_message', 'download_time']])
            db.update_by_id('downloads', download['downloads_id'], download_update)

        # NOTE(review): both the inserted downloads_id and the WHERE clause use the NEW
        # download's ID, so this copies nothing unless a download_texts row already exists
        # for the new download — confirm whether the WHERE was meant to reference
        # old_download['downloads_id'].
        db.query(
            """
            INSERT INTO download_texts (
                downloads_id,
                download_text,
                download_text_length
            )
                SELECT
                    %(downloads_id)s,
                    dt.download_text,
                    dt.download_text_length
                FROM download_texts AS dt
                WHERE dt.downloads_id = %(downloads_id)s
            """,
            {
                'downloads_id': download['downloads_id'],
            },
        )

    # Copy the old story's sentences over to the new story
    # noinspection SqlInsertValues
    db.query(
        """
        INSERT INTO story_sentences (
            stories_id,
            sentence_number,
            sentence,
            media_id,
            publish_date,
            language
        )
            SELECT
                %(new_stories_id)s,
                sentence_number,
                sentence,
                media_id,
                publish_date,
                language
            FROM story_sentences
            WHERE stories_id = %(old_stories_id)s
        """,
        {
            'old_stories_id': old_story['stories_id'],
            'new_stories_id': int(story['stories_id']),
        },
    )

    return story