def add_stories_from_feed(self, db: DatabaseHandler, download: dict, content: str) -> List[int]:
    """
    Handle feeds of type 'web_page' by just creating a story to associate with the content.

    Web page feeds are feeds that consist of a web page that we download once a week and add as a story.
    """
    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    feeds_id = download['feeds_id']

    feed = db.find_by_id(table='feeds', object_id=feeds_id)

    title = html_title(html=content, fallback='(no title)')
    title += '[' + sql_now() + ']'

    guid = f"{str(int(time.time()))}:{download['url']}"[0:1024]

    new_story = {
        'url': download['url'],
        'guid': guid,
        'media_id': feed['media_id'],
        'publish_date': sql_now(),
        'title': title,
    }

    story = add_story(db=db, story=new_story, feeds_id=feeds_id)
    if not story:
        raise McCrawlerFetcherSoftError(f"Failed to add story {new_story}")

    db.query("""
        UPDATE downloads
        SET stories_id = %(stories_id)s,
            type = 'content'
        WHERE downloads_id = %(downloads_id)s
    """, {
        'stories_id': story['stories_id'],
        'downloads_id': download['downloads_id'],
    })

    # A webpage that was just fetched is also a story
    story_ids = [
        story['stories_id'],
    ]

    return story_ids

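# Hedged usage sketch (not from the original module): how a fetched 'web_page' feed
# download might flow through add_stories_from_feed() above. `WebPageHandler` and the
# already-fetched `content` are assumptions for illustration; only the method's own
# signature is taken from the code above.
def _example_handle_web_page_download(db: DatabaseHandler, downloads_id: int, content: str) -> None:
    handler = WebPageHandler()  # hypothetical class that defines add_stories_from_feed()
    download = db.find_by_id(table='downloads', object_id=downloads_id)

    # One story is created per fetched page; the download row gets repointed at it
    story_ids = handler.add_stories_from_feed(db=db, download=download, content=content)
    log.info("Created stories %s for download %d" % (story_ids, downloads_id))
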
def fetch_download(self, db: DatabaseHandler, download: dict) -> Optional[Response]:
    download = decode_object_from_bytes_if_needed(download)

    url = self._download_url(download=download)
    if not is_http_url(url):
        raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

    download['download_time'] = sql_now()
    download['state'] = 'fetching'

    try:
        db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)
    except McTupleAlreadyMovedError as ex:
        # Some attempts to set the download's row to "fetching" fail with:
        #
        #   "tuple to be locked was already moved to another partition due to concurrent update"
        #
        # If that happens, we assume that some other fetcher instance somehow got to the download first and do
        # nothing
        log.warning(f"Some other fetcher got to download {download['downloads_id']} first: {ex}")
        return None
    except Exception as ex:
        # Raise further on misc. errors
        raise ex

    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)

    return response

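# Sketch (assumption, not original code): a caller of fetch_download() above must treat
# a None response as "another fetcher instance claimed this row" and skip the download
# rather than retry it.
def _example_fetch_if_unclaimed(fetcher, db: DatabaseHandler, download: dict) -> Optional[Response]:
    response = fetcher.fetch_download(db=db, download=download)
    if response is None:
        # Lost the cross-partition race described in fetch_download(); nothing to do
        log.debug("Download %s was claimed by another fetcher" % download['downloads_id'])
    return response
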
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = f"{user['name']} ({user['screen_name']}): {user['description']}"
    title = f"{user['name']} ({user['screen_name']}) | Twitter"
    tweet_date = sql_now()
    url = f"https://twitter.com/{user['screen_name']}"

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)

    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # Twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)

    stories_id = story['stories_id']
    tags_id = undateable_tag['tags_id']

    db.query("""
        INSERT INTO public.stories_tags_map (stories_id, tags_id)
        VALUES (%(stories_id)s, %(tags_id)s)
        ON CONFLICT (stories_id, tags_id) DO NOTHING
    """, {
        'stories_id': stories_id,
        'tags_id': tags_id,
    })

    return story

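# Illustrative input (an assumption): the `user` dict mirrors what the Twitter API
# returns; only the three keys read by _add_user_story() above are actually required.
example_twitter_user = {
    'name': 'Example User',
    'screen_name': 'example',
    'description': 'An example account used to illustrate the call.',
}
# story = _add_user_story(db=db, topic=topic, user=example_twitter_user, topic_fetch_urls=[])
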
def __log_request(request: Request) -> None:
    """Log HTTP request."""
    # FIXME use Python's logging facilities

    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if len(url) == 0:
        raise McRequestException("URL is empty.")

    config = py_get_config()

    http_request_log_path = os.path.join(config['mediawords']['data_dir'], 'logs', 'http_request.log')

    with open(http_request_log_path, 'a') as f:

        while True:
            try:
                fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
                break
            except IOError as e:
                # Raise on unrelated IOErrors
                if e.errno != errno.EAGAIN:
                    raise
                else:
                    log.warning("Waiting for HTTP request log lock...")
                    time.sleep(0.1)

        f.write("%s %s\n" % (sql_now(), url,))

        # Doesn't write the "invalidating blacklist url <...>" message because it's apparent from the URL itself

        fcntl.flock(f, fcntl.LOCK_UN)

    # Processes run by various users (web service, workers, ...) will want to write to the same file
    try:
        os.chmod(http_request_log_path, 0o666)
    except PermissionError as ex:
        # The web server process might attempt to chmod the file without the appropriate permissions
        log.debug("Failed to chmod %s: %s" % (http_request_log_path, str(ex),))

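# Self-contained sketch of the non-blocking flock pattern used above: attempt
# LOCK_EX | LOCK_NB in a loop and back off briefly while another process holds the
# lock. The path and message arguments are illustrative, not taken from the original.
import errno
import fcntl
import time


def _example_append_with_lock(path: str, line: str) -> None:
    with open(path, mode='a', encoding='utf-8') as f:
        while True:
            try:
                fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
                break
            except IOError as e:
                if e.errno != errno.EAGAIN:
                    # Raise on unrelated I/O errors
                    raise
                # Lock is held by another process; retry shortly
                time.sleep(0.1)

        f.write(line + "\n")
        fcntl.flock(f, fcntl.LOCK_UN)
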
def __log_request(request: Request) -> None:
    """Log HTTP request."""
    # FIXME use Python's logging facilities

    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if len(url) == 0:
        raise McRequestException("URL is empty.")

    log.debug("HTTP request: %s %s\n" % (sql_now(), url,))

def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
    download = decode_object_from_bytes_if_needed(download)

    download['download_time'] = sql_now()
    download['state'] = 'fetching'

    db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)

    ua = UserAgent()

    # Append the API credentials / signature from configuration to the requested URL
    url_with_credentials = self._api_request_url_with_signature_from_config(api_url=download['url'])
    request = Request(method='GET', url=url_with_credentials)

    response = ua.request(request)

    return response

def update_job_state_message(self, db: DatabaseHandler, message: str) -> None:
    """
    Update the message field for the current "job_states" row.

    This is a public method that is intended to be used by code run anywhere above the stack from run() to publish
    messages updating the progress of a long running job.
    """
    message = decode_object_from_bytes_if_needed(message)

    # Verify that the "job_states" row still exists before updating it
    db.require_by_id(table='job_states', object_id=self.__job_states_id)

    job_state = db.update_by_id(table='job_states', object_id=self.__job_states_id, update_hash={
        'message': message,
        'last_updated': sql_now(),
    })

    self.__update_table_state(db=db, job_state=job_state)

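# Sketch (assumption): code running above run() in the call stack might publish
# progress like this; `self` is the job instance that defines the method above, and
# the batch loop is purely illustrative.
#
#     for n, batch in enumerate(batches, start=1):
#         process_batch(batch)
#         self.update_job_state_message(db=db, message=f"Processed {n}/{len(batches)} batches")
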
def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
    download = decode_object_from_bytes_if_needed(download)

    url = self._download_url(download=download)
    if not is_http_url(url):
        raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

    download['download_time'] = sql_now()
    download['state'] = 'fetching'

    db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)

    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)

    return response

def __log_request(request: Request) -> None:
    """Log HTTP request."""
    # FIXME use Python's logging facilities

    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if len(url) == 0:
        raise McRequestException("URL is empty.")

    config = py_get_config()

    http_request_log_path = os.path.join(config['mediawords']['data_dir'], 'logs', 'http_request.log')

    with open(http_request_log_path, encoding='utf-8', mode='a') as f:

        while True:
            try:
                fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
                break
            except IOError as e:
                # Raise on unrelated IOErrors
                if e.errno != errno.EAGAIN:
                    raise
                else:
                    log.warning("Waiting for HTTP request log lock...")
                    time.sleep(0.1)

        f.write("%s %s\n" % (sql_now(), url,))

        # Doesn't write the "invalidating blacklist url <...>" message because it's apparent from the URL itself

        fcntl.flock(f, fcntl.LOCK_UN)

    # Processes run by various users (web service, workers, ...) will want to write to the same file
    try:
        os.chmod(http_request_log_path, 0o666)
    except PermissionError as ex:
        # The web server process might attempt to chmod the file without the appropriate permissions
        log.debug("Failed to chmod %s: %s" % (http_request_log_path, str(ex),))

def update_job_state(self, db: DatabaseHandler, state: str, message: Optional[str] = ''):
    """
    Update the state and message fields of the "job_states" table for the currently active "job_states_id".

    "job_states_id" is set and unset in method run() below, so this must be called from code running from within
    the run() implementation of the subclass.
    """
    state = decode_object_from_bytes_if_needed(state)
    message = decode_object_from_bytes_if_needed(message)

    log.debug(f"{self.__queue_name} state: {state}")

    job_state = db.update_by_id(table='job_states', object_id=self.__job_states_id, update_hash={
        'state': state,
        'last_updated': sql_now(),
        'message': message,
    })

    self.__update_table_state(db=db, job_state=job_state)

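# Sketch (assumption): typical transitions around a run() implementation. The state
# names 'running', 'completed' and 'error' are illustrative, not prescribed by the
# method above.
#
#     self.update_job_state(db, state='running')
#     try:
#         do_work()
#         self.update_job_state(db, state='completed')
#     except Exception as ex:
#         self.update_job_state(db, state='error', message=str(ex))
#         raise
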
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = '%s (%s): %s' % (user['name'], user['screen_name'], user['description'])
    title = '%s (%s) | Twitter' % (user['name'], user['screen_name'])
    tweet_date = sql_now()
    url = 'https://twitter.com/%s' % user['screen_name']

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # Twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)
    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': undateable_tag['tags_id']})

    return story

def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """
    story = db.create(
        'stories',
        {
            'url': old_story['url'],
            'media_id': new_medium['media_id'],
            'guid': old_story['guid'],
            'publish_date': old_story['publish_date'],
            'collect_date': sql_now(),
            'description': old_story['description'],
            'title': old_story['title'],
        },
    )

    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    for old_story_tag in db.query(
        """
        SELECT tags_id
        FROM stories_tags_map
        WHERE stories_id = %(stories_id)s
        ORDER BY tags_id
        """,
        {'stories_id': old_story['stories_id']},
    ).hashes():
        stories_id = story['stories_id']
        tags_id = old_story_tag['tags_id']

        db.query("""
            INSERT INTO stories_tags_map (stories_id, tags_id)
            VALUES (%(stories_id)s, %(tags_id)s)
            ON CONFLICT (stories_id, tags_id) DO NOTHING
        """, {
            'stories_id': stories_id,
            'tags_id': tags_id,
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    old_download = db.query(
        """
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
        ORDER BY downloads_id
        LIMIT 1
        """,
        {
            'stories_id': old_story['stories_id'],
        }
    ).hash()

    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            download_update = dict([(f, old_download[f]) for f in ['state', 'error_message', 'download_time']])
            db.update_by_id('downloads', download['downloads_id'], download_update)

        db.query(
            """
            INSERT INTO download_texts (
                downloads_id,
                download_text,
                download_text_length
            )
                SELECT
                    %(downloads_id)s,
                    dt.download_text,
                    dt.download_text_length
                FROM download_texts AS dt
                WHERE dt.downloads_id = %(downloads_id)s
            """,
            {
                'downloads_id': download['downloads_id'],
            },
        )

    # noinspection SqlInsertValues
    db.query(
        """
        INSERT INTO story_sentences (
            stories_id,
            sentence_number,
            sentence,
            media_id,
            publish_date,
            language
        )
            SELECT
                %(new_stories_id)s,
                sentence_number,
                sentence,
                media_id,
                publish_date,
                language
            FROM story_sentences
            WHERE stories_id = %(old_stories_id)s
        """,
        {
            'old_stories_id': old_story['stories_id'],
            'new_stories_id': int(story['stories_id']),
        },
    )

    return story

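# Hedged usage sketch (not original code): moving a topic story whose medium turned
# out to be a duplicate. `lookup_medium_dup` is a hypothetical helper; only
# copy_story_to_new_medium()'s own signature is taken from the code above.
#
#     new_medium = lookup_medium_dup(db, old_story['media_id'])  # hypothetical
#     new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)
#     log.info("Copied story %d to %d" % (old_story['stories_id'], new_story['stories_id']))
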
def test_cliff_tagger():
    db = connect_to_db()

    media = db.create(table='media', insert_hash={
        'name': "test medium",
        'url': "url://test/medium",
    })

    story = db.create(table='stories', insert_hash={
        'media_id': media['media_id'],
        'url': 'url://story/a',
        'guid': 'guid://story/a',
        'title': 'story a',
        'description': 'description a',
        'publish_date': sql_now(),
        'collect_date': sql_now(),
        'full_text_rss': True,
    })
    stories_id = story['stories_id']

    db.create(table='story_sentences', insert_hash={
        'stories_id': stories_id,
        'sentence_number': 1,
        'sentence': 'I hope that the CLIFF annotator is working.',
        'media_id': media['media_id'],
        'publish_date': sql_now(),
        'language': 'en',
    })

    store = CLIFFAnnotatorStore()
    store.store_annotation_for_story(db=db, stories_id=stories_id, annotation=sample_cliff_response())

    cliff = CLIFFTagger()
    cliff.update_tags_for_story(db=db, stories_id=stories_id)

    story_tags = db.query(
        """
        SELECT
            tags.tag AS tags_name,
            tags.label AS tags_label,
            tags.description AS tags_description,
            tag_sets.name AS tag_sets_name,
            tag_sets.label AS tag_sets_label,
            tag_sets.description AS tag_sets_description
        FROM stories_tags_map
            INNER JOIN tags
                ON stories_tags_map.tags_id = tags.tags_id
            INNER JOIN tag_sets
                ON tags.tag_sets_id = tag_sets.tag_sets_id
        WHERE stories_tags_map.stories_id = %(stories_id)s
        ORDER BY lower(tag_sets.name), lower(tags.tag)
        """,
        {'stories_id': stories_id}
    ).hashes()

    expected_tags = expected_cliff_tags()

    assert story_tags == expected_tags

def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """
    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': sql_now(),
        'description': old_story['description'],
        'title': old_story['title'],
    }
    story = db.create('stories', story)
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, stm.tags_id
                from stories_tags_map stm
                where stm.stories_id = %(b)s
        """,
        {'a': story['stories_id'], 'b': old_story['stories_id']})

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {'a': old_story['stories_id']}).hash()

    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            download_update = dict([(f, old_download[f]) for f in ['state', 'error_message', 'download_time']])
            db.update_by_id('downloads', download['downloads_id'], download_update)

        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(a)s
            """,
            {'a': download['downloads_id']})

    # noinspection SqlInsertValues
    db.query(
        f"""
        insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)
            select {int(story['stories_id'])} as stories_id, sentence_number, sentence, media_id, publish_date,
                language
                from story_sentences
                where stories_id = %(b)s
        """,
        {'b': old_story['stories_id']})

    return story

def test_nyt_labels_annotator(self):
    media = self.db().create(table='media', insert_hash={
        'name': "test medium",
        'url': "url://test/medium",
    })

    story = self.db().create(table='stories', insert_hash={
        'media_id': media['media_id'],
        'url': 'url://story/a',
        'guid': 'guid://story/a',
        'title': 'story a',
        'description': 'description a',
        'publish_date': sql_now(),
        'collect_date': sql_now(),
        'full_text_rss': True,
    })
    stories_id = story['stories_id']

    self.db().create(table='story_sentences', insert_hash={
        'stories_id': stories_id,
        'sentence_number': 1,
        'sentence': 'I hope that the CLIFF annotator is working.',
        'media_id': media['media_id'],
        'publish_date': sql_now(),
        'language': 'en',
    })

    def __nyt_labels_sample_response(_: HashServer.Request) -> Union[str, bytes]:
        """Mock annotator."""
        response = ""
        response += "HTTP/1.0 200 OK\r\n"
        response += "Content-Type: application/json; charset=UTF-8\r\n"
        response += "\r\n"
        response += encode_json(self.__sample_nyt_labels_response())
        return response

    pages = {
        '/predict.json': {
            'callback': __nyt_labels_sample_response,
        }
    }

    port = random_unused_port()
    annotator_url = 'http://localhost:%d/predict.json' % port

    hs = HashServer(port=port, pages=pages)
    hs.start()

    # Inject NYTLabels credentials into configuration
    config = py_get_config()
    new_config = copy.deepcopy(config)
    new_config['nytlabels'] = {
        'enabled': True,
        'annotator_url': annotator_url,
    }
    py_set_config(new_config)

    nytlabels = NYTLabelsAnnotator()
    nytlabels.annotate_and_store_for_story(db=self.db(), stories_id=stories_id)
    nytlabels.update_tags_for_story(db=self.db(), stories_id=stories_id)

    hs.stop()

    # Reset configuration
    py_set_config(config)

    annotation_exists = self.db().query(
        """
        SELECT 1
        FROM nytlabels_annotations
        WHERE object_id = %(object_id)s
        """,
        {'object_id': stories_id}).hash()
    assert annotation_exists is not None

    story_tags = self.db().query(
        """
        SELECT
            tags.tag AS tags_name,
            tags.label AS tags_label,
            tags.description AS tags_description,
            tag_sets.name AS tag_sets_name,
            tag_sets.label AS tag_sets_label,
            tag_sets.description AS tag_sets_description
        FROM stories_tags_map
            INNER JOIN tags
                ON stories_tags_map.tags_id = tags.tags_id
            INNER JOIN tag_sets
                ON tags.tag_sets_id = tag_sets.tag_sets_id
        WHERE stories_tags_map.stories_id = %(stories_id)s
        ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
        """,
        {'stories_id': stories_id}).hashes()

    expected_tags = self.__expected_tags()

    assert story_tags == expected_tags

def test_add_stale_feeds():
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')

    pending_feeds = []

    feed = {
        'media_id': medium['media_id'],
        'name': 'null last download',
        'url': 'http://null last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': None,
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'recent last download',
        'url': 'http://recent last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': sql_now(),
    }
    db.create('feeds', feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'recent last new story',
        'url': 'http://recent last new story',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': sql_now(),
        'last_new_story_time': sql_now(),
    }
    db.create('feeds', feed)

    feed = {
        'media_id': medium['media_id'],
        'name': '5 minute new story',
        'url': 'http://5 minute new story',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': get_sql_date_from_epoch(int(time.time()) - 300),
        'last_new_story_time': get_sql_date_from_epoch(int(time.time()) - 300),
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'old last download',
        'url': 'http://old last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': get_sql_date_from_epoch(int(time.time()) - (86400 * 10)),
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    _add_stale_feeds(db)

    num_pending_downloads = db.query("select count(*) from downloads where state = 'pending'").flat()[0]
    assert num_pending_downloads == len(pending_feeds)

    for feed in pending_feeds:
        exists = db.query(
            "select * from downloads where state = 'pending' and feeds_id = %(a)s",
            {'a': feed['feeds_id']}).hash()
        assert exists, "download for feed %s added" % feed['name']

def test_cliff_annotator():
    db = connect_to_db()

    media = db.create(table='media', insert_hash={
        'name': "test medium",
        'url': "url://test/medium",
    })

    story = db.create(table='stories', insert_hash={
        'media_id': media['media_id'],
        'url': 'url://story/a',
        'guid': 'guid://story/a',
        'title': 'story a',
        'description': 'description a',
        'publish_date': sql_now(),
        'collect_date': sql_now(),
        'full_text_rss': True,
    })
    stories_id = story['stories_id']

    db.create(table='story_sentences', insert_hash={
        'stories_id': stories_id,
        'sentence_number': 1,
        'sentence': 'I hope that the CLIFF annotator is working.',
        'media_id': media['media_id'],
        'publish_date': sql_now(),
        'language': 'en',
    })

    def __cliff_sample_response(_: HashServer.Request) -> Union[str, bytes]:
        """Mock annotator."""
        response = ""
        response += "HTTP/1.0 200 OK\r\n"
        response += "Content-Type: application/json; charset=UTF-8\r\n"
        response += "\r\n"
        response += encode_json(sample_cliff_response())
        return response

    pages = {
        '/cliff/parse/text': {
            'callback': __cliff_sample_response,
        }
    }

    port = random_unused_port()
    annotator_url = 'http://localhost:%d/cliff/parse/text' % port

    hs = HashServer(port=port, pages=pages)
    hs.start()

    class TestCLIFFFetcherConfig(CLIFFFetcherConfig):
        @staticmethod
        def annotator_url() -> str:
            return annotator_url

    cliff = CLIFFAnnotatorFetcher(fetcher_config=TestCLIFFFetcherConfig())
    cliff.annotate_and_store_for_story(db=db, stories_id=stories_id)

    hs.stop()

    annotation_exists = db.query("""
        SELECT 1
        FROM cliff_annotations
        WHERE object_id = %(object_id)s
    """, {'object_id': stories_id}).hash()
    assert annotation_exists is not None

def test_sql_now():
    assert sql_now() == datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')

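# Note (assumption, not original code): the equality assertion above can flake if the
# wall clock ticks over a second between the two calls. A tolerant variant parses the
# timestamp and checks that it falls within a small window of "now":
def test_sql_now_tolerant_example():
    parsed = datetime.datetime.strptime(sql_now(), '%Y-%m-%d %H:%M:%S')
    assert abs((datetime.datetime.now() - parsed).total_seconds()) < 2
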
def test_add_stale_feeds(self) -> None:
    """Test _add_stale_feeds()."""
    db = self.db()

    medium = mediawords.test.db.create.create_test_medium(db, 'foo')

    pending_feeds = []

    feed = {
        'media_id': medium['media_id'],
        'name': 'null last download',
        'url': 'http://null last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': None,
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'recent last download',
        'url': 'http://recent last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': sql_now(),
    }
    feed = db.create('feeds', feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'recent last new story',
        'url': 'http://recent last new story',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': sql_now(),
        'last_new_story_time': sql_now(),
    }
    feed = db.create('feeds', feed)

    feed = {
        'media_id': medium['media_id'],
        'name': '5 minute new story',
        'url': 'http://5 minute new story',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': get_sql_date_from_epoch(time.time() - 300),
        'last_new_story_time': get_sql_date_from_epoch(time.time() - 300),
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    feed = {
        'media_id': medium['media_id'],
        'name': 'old last download',
        'url': 'http://old last download',
        'type': 'syndicated',
        'active': True,
        'last_attempted_download_time': get_sql_date_from_epoch(time.time() - (86400 * 10)),
    }
    feed = db.create('feeds', feed)
    pending_feeds.append(feed)

    mediawords.crawler.provider._add_stale_feeds(db)

    num_pending_downloads = db.query("select count(*) from downloads where state = 'pending'").flat()[0]
    assert num_pending_downloads == len(pending_feeds)

    for feed in pending_feeds:
        exists = db.query(
            "select * from downloads where state = 'pending' and feeds_id = %(a)s",
            {'a': feed['feeds_id']}).hash()
        assert exists, "download for feed %s added" % feed['name']

def test_tagging(self):
    db = connect_to_db()

    media = db.create(table='media', insert_hash={
        'name': "test medium",
        'url': "url://test/medium",
    })

    story = db.create(table='stories', insert_hash={
        'media_id': media['media_id'],
        'url': 'url://story/a',
        'guid': 'guid://story/a',
        'title': 'story a',
        'description': 'description a',
        'publish_date': sql_now(),
        'collect_date': sql_now(),
        'full_text_rss': True,
    })
    stories_id = story['stories_id']

    db.create(table='story_sentences', insert_hash={
        'stories_id': stories_id,
        'sentence_number': 1,
        'sentence': 'I hope that the CLIFF annotator is working.',
        'media_id': media['media_id'],
        'publish_date': sql_now(),
        'language': 'en',
    })

    def __cliff_sample_response(_: HashServer.Request) -> Union[str, bytes]:
        """Mock annotator."""
        response = ""
        response += "HTTP/1.0 200 OK\r\n"
        response += "Content-Type: application/json; charset=UTF-8\r\n"
        response += "\r\n"
        response += encode_json(sample_cliff_response())
        return response

    pages = {
        '/cliff/parse/text': {
            'callback': __cliff_sample_response,
        }
    }

    port = random_unused_port()
    annotator_url = 'http://localhost:%d/cliff/parse/text' % port

    hs = HashServer(port=port, pages=pages)
    hs.start()

    class TestCLIFFFetcherConfig(CLIFFTagsFromAnnotationConfig):
        @staticmethod
        def annotator_url() -> str:
            return annotator_url

    cliff = CLIFFTagsFromAnnotation(tagger_config=TestCLIFFFetcherConfig())
    cliff.update_tags_for_story(db=db, stories_id=stories_id)

    hs.stop()

    story_tags = db.query(
        """
        SELECT
            tags.tag AS tags_name,
            tags.label AS tags_label,
            tags.description AS tags_description,
            tag_sets.name AS tag_sets_name,
            tag_sets.label AS tag_sets_label,
            tag_sets.description AS tag_sets_description
        FROM stories_tags_map
            INNER JOIN tags
                ON stories_tags_map.tags_id = tags.tags_id
            INNER JOIN tag_sets
                ON tags.tag_sets_id = tag_sets.tag_sets_id
        WHERE stories_tags_map.stories_id = %(stories_id)s
        ORDER BY lower(tag_sets.name), lower(tags.tag)
        """,
        {'stories_id': stories_id}
    ).hashes()

    expected_tags = expected_cliff_tags()

    assert story_tags == expected_tags

def test_nyt_labels_annotator(self):
    db = connect_to_db()

    media = db.create(table='media', insert_hash={
        'name': "test medium",
        'url': "url://test/medium",
    })

    story = db.create(table='stories', insert_hash={
        'media_id': media['media_id'],
        'url': 'url://story/a',
        'guid': 'guid://story/a',
        'title': 'story a',
        'description': 'description a',
        'publish_date': sql_now(),
        'collect_date': sql_now(),
        'full_text_rss': True,
    })
    stories_id = story['stories_id']

    db.create(table='story_sentences', insert_hash={
        'stories_id': stories_id,
        'sentence_number': 1,
        'sentence': 'I hope that the CLIFF annotator is working.',
        'media_id': media['media_id'],
        'publish_date': sql_now(),
        'language': 'en',
    })

    store = NYTLabelsAnnotatorStore()
    store.store_annotation_for_story(db=db, stories_id=stories_id, annotation=sample_nytlabels_response())

    nytlabels = NYTLabelsTagger()
    nytlabels.update_tags_for_story(db=db, stories_id=stories_id)

    annotation_exists = db.query("""
        SELECT 1
        FROM nytlabels_annotations
        WHERE object_id = %(object_id)s
    """, {'object_id': stories_id}).hash()
    assert annotation_exists is not None

    story_tags = db.query("""
        SELECT
            tags.tag AS tags_name,
            tags.label AS tags_label,
            tags.description AS tags_description,
            tag_sets.name AS tag_sets_name,
            tag_sets.label AS tag_sets_label,
            tag_sets.description AS tag_sets_description
        FROM stories_tags_map
            INNER JOIN tags
                ON stories_tags_map.tags_id = tags.tags_id
            INNER JOIN tag_sets
                ON tags.tag_sets_id = tag_sets.tag_sets_id
        WHERE stories_tags_map.stories_id = %(stories_id)s
        ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
    """, {'stories_id': stories_id}).hashes()

    expected_tags = expected_nytlabels_tags()

    assert story_tags == expected_tags
