def store_content(db: DatabaseHandler, download: dict, content: str) -> dict:
    """Store the content for the download and record the outcome on its 'downloads' row.

    Arguments:
    db - db handle
    download - download row; 'downloads_id' and 'state' are read from it, and 'error_message' is read
        when the state is 'feed_error'
    content - content to store

    Returns:
    The download row as re-read from the database after the update.

    Raises:
    McDBIDownloadsException - if writing the content fails; the original error is attached as the cause.
    """
    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    # feed_error state indicates that the download was successful but that there was a problem
    # parsing the feed afterward. so we want to keep the feed_error state even if we redownload
    # the content
    new_state = 'success' if download['state'] != 'feed_error' else 'feed_error'

    try:
        path = _get_store_for_writing().store_content(db, download['downloads_id'], content)
    except Exception as ex:
        # Chain the original error so the traceback shows what actually failed in the store
        raise McDBIDownloadsException(
            "error while trying to store download %d: %s" % (download['downloads_id'], ex)
        ) from ex

    # Only a successful download clears the error message; a feed_error keeps the one it had
    if new_state == 'success':
        download['error_message'] = ''

    db.update_by_id(
        table='downloads',
        object_id=download['downloads_id'],
        update_hash={'state': new_state, 'path': path, 'error_message': download['error_message']},
    )

    download = db.find_by_id('downloads', download['downloads_id'])

    return download
def __update_table_state(self, db: DatabaseHandler, job_state: Dict[str, Any]) -> None:
    """
    Copy the job's state and message into the configured extra table.

    The row that gets updated is the one whose '<table>_id' equals the value stored under that same key in the
    job args. Nothing is written if the args can't be decoded, no extra table is configured, or the args carry
    no ID for the table.
    """
    job_state = decode_object_from_bytes_if_needed(job_state)

    try:
        args = decode_json(job_state.get('args', ''))
    except Exception as ex:
        log.error(f"Unable to decode args from job state {job_state}: {ex}")
        return

    extra_table = self.__state_config.extra_table()
    if not extra_table:
        log.debug("Extra table for storing state is not configured.")
        return

    id_field = extra_table.table_name() + '_id'
    id_value = args.get(id_field)
    if not id_value:
        # The <table>_id may not exist yet: some jobs only create the row once run() is under way
        # (e.g. SnapshotTopic has to create the snapshot first).
        log.warning(f"Unable to get ID value for field '{id_field}' from job state {job_state}")
        return None

    new_values = {
        extra_table.state_column(): job_state.get('state'),
        extra_table.message_column(): job_state.get('message'),
    }
    db.update_by_id(table=extra_table.table_name(), object_id=id_value, update_hash=new_values)
def update_job_state_args(self, db: DatabaseHandler, args: Dict[str, Any]) -> None:
    """Merge the given args into the args stored on the current "job_states" row.

    Keys in the given args override keys already stored on the row. If the stored args can't be decoded,
    the error is logged and the given args replace them.
    """
    args = decode_object_from_bytes_if_needed(args)

    job_state = db.require_by_id(table='job_states', object_id=self.__job_states_id)

    try:
        # job_states.args was changed from JSON to JSONB while sharding the database, and psycopg2 offers
        # no way to turn off JSONB decoding (unlike JSON), so the column may come back either as a JSON
        # string or as an already decoded dictionary
        stored_args = job_state.get('args', '')
        db_args = stored_args if isinstance(stored_args, dict) else decode_json(stored_args)
    except Exception as ex:
        log.error(
            f"Unable to decode args from job state {job_state}: {ex}")
        db_args = {}

    merged_args = {**db_args, **args}

    db.update_by_id(
        table='job_states',
        object_id=self.__job_states_id,
        update_hash={'args': encode_json(merged_args)},
    )
def validate_remote_integration(db: DatabaseHandler, source: str, query: str, day: str) -> None:
    """Run sanity test on remote APIs.

    Creates a test topic with a twitter seed query for the given source and query, restricts it to the
    given day, fetches the topic posts and asserts that enough posts with plausible content and author
    came back.
    """
    topic = create_test_topic(db, "test_remote_integration")

    db.create('topic_seed_queries', {
        'topics_id': topic['topics_id'],
        'platform': 'twitter',
        'source': source,
        'query': query
    })

    topic_settings = (
        ('platform', 'twitter'),
        ('pattern', '.*'),
        ('start_date', day),
        ('end_date', day),
        ('mode', 'url_sharing'),
    )
    for field, value in topic_settings:
        topic[field] = value

    db.update_by_id('topics', topic['topics_id'], topic)

    fetch_topic_posts(db, topic['topics_id'])

    got_tts = db.query("select * from topic_posts").hashes()

    # for old ch monitors, lots of the posts may be deleted
    assert len(got_tts) > 20

    first_post = got_tts[0]
    assert len(first_post['content']) > MIN_TEST_POST_LENGTH
    assert len(first_post['author']) > MIN_TEST_AUTHOR_LENGTH
def fetch_download(self, db: DatabaseHandler, download: dict) -> Optional[Response]:
    """Mark the download as being fetched and fetch its URL.

    Arguments:
    db - db handle
    download - download row; 'download_time' and 'state' are set on it and the whole row is written back
        to the 'downloads' table before fetching

    Returns:
    The HTTP response, or None if marking the row as "fetching" failed with McTupleAlreadyMovedError.

    Raises:
    McCrawlerFetcherSoftError - if the download's URL is not an HTTP(S) URL.
    Any other error raised by the database update propagates unchanged.
    """
    download = decode_object_from_bytes_if_needed(download)

    url = self._download_url(download=download)
    if not is_http_url(url):
        raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

    download['download_time'] = sql_now()
    download['state'] = 'fetching'

    try:
        db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)
    except McTupleAlreadyMovedError as ex:
        # Some attempts to set the download's row to "fetching" fail with:
        #
        #   "tuple to be locked was already moved to another partition due to concurrent update"
        #
        # If that happens, we assume that some other fetcher instance somehow got to the download first
        # and do nothing
        log.warning(f"Some other fetcher got to download {download['downloads_id']} first: {ex}")
        return None

    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)

    return response
def _update_tfu_message(db: DatabaseHandler, topic_fetch_url: dict, message: str) -> None:
    """Write the message to the topic_fetch_urls row, but only when debug messages are switched on."""
    if not _USE_TFU_DEBUG_MESSAGES:
        return

    row_id = topic_fetch_url['topic_fetch_urls_id']
    db.update_by_id('topic_fetch_urls', row_id, {'message': message})
def store_content(db: DatabaseHandler, download: dict, content: str) -> dict:
    """Store the content for the download and record the outcome on its 'downloads' row.

    Arguments:
    db - db handle
    download - download row; 'downloads_id' and 'state' are read from it
    content - content to store

    Returns:
    The download row as re-read from the database after the update.

    Raises:
    McDBIDownloadsException - if writing the content fails; the original error is attached as the cause.
        The 'downloads' row is not touched in that case.
    """
    # feed_error state indicates that the download was successful but that there was a problem
    # parsing the feed afterward. so we want to keep the feed_error state even if we redownload
    # the content
    new_state = 'success' if download['state'] != 'feed_error' else 'feed_error'

    try:
        path = _get_store_for_writing().store_content(db, download['downloads_id'], content)
    except Exception as e:
        raise McDBIDownloadsException(
            "error while trying to store download %d: %s" % (download['downloads_id'], e)
        ) from e

    # Reaching this point means the content was stored, so the error message is always cleared
    db.update_by_id(
        'downloads',
        download['downloads_id'],
        {
            'state': new_state,
            'path': path,
            'error_message': '',
        },
    )

    download = db.find_by_id('downloads', download['downloads_id'])

    return download
def fetch_topic_url(db: DatabaseHandler, topic_fetch_urls_id: int, domain_timeout: typing.Optional[int] = None) -> None:
    """Fetch a url for a topic and create a media cloud story from it if its content matches the topic pattern.

    Update the following fields in the topic_fetch_urls row:

    code - the status code of the http response
    fetch_date - the current time
    state - one of the FETCH_STATE_* constants
    message - message related to the state (eg. HTTP message for FETCH_STATE_REQUEST_FAILED)
    stories_id - the id of the story generated from the fetched content, or null if no story created

    If topic_links_id is present in the topic_fetch_url and if a story was added or matched, assign the resulting
    topic_fetch_urls.stories_id to topic_links.ref_stories_id.

    If the state is anything but FETCH_STATE_PENDING or FETCH_STATE_REQUEUED, return without doing anything.

    If there is content for the corresponding url and topics_id in topic_seed_urls, use that content instead of
    fetching the url.

    This function catches almost all possible exceptions and stashes them in topic_fetch_urls along with a state of
    FETCH_STATE_PYTHON_ERROR. The exception is McThrottledDomainException, which is re-raised to the caller; the
    topic_fetch_urls row is not written back in that case.

    Arguments:
    db - db handle
    topic_fetch_urls_id - id of topic_fetch_urls row
    domain_timeout - passed through to _try_fetch_topic_url()

    Returns:
    None
    """
    topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)

    try:
        log.info("fetch_link: %s" % topic_fetch_url['url'])
        # NOTE(review): the fields listed in the docstring are presumably filled in on topic_fetch_url by this
        # helper; this function only writes the row back at the end -- confirm against the helper
        _try_fetch_topic_url(db=db, topic_fetch_url=topic_fetch_url, domain_timeout=domain_timeout)

        if 'stories_id' in topic_fetch_url and topic_fetch_url['stories_id'] is not None:
            story = db.require_by_id('stories', topic_fetch_url['stories_id'])
            topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
            redirect_url = topic_fetch_url['url']
            assume_match = topic_fetch_url['assume_match']
            # Only add the story to the topic if it is not in it yet and it matches the topic
            if _is_not_topic_story(db, topic_fetch_url):
                if _story_matches_topic(db, story, topic, redirect_url=redirect_url, assume_match=assume_match):
                    _add_to_topic_stories(db, story, topic)

            # NOTE(review): nesting of this check was reconstructed from whitespace-collapsed source -- confirm
            # that it belongs inside the "stories_id is not None" branch
            if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
                try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    except McThrottledDomainException as ex:
        # Must be listed before the catch-all handler below so that throttling reaches the caller
        raise ex

    except Exception as ex:
        log.error("Error while fetching URL {}: {}".format(topic_fetch_url, ex))

        # Record the failure on the row instead of propagating it
        topic_fetch_url['state'] = FETCH_STATE_PYTHON_ERROR
        topic_fetch_url['message'] = traceback.format_exc()
        log.warning('topic_fetch_url %s failed: %s' % (topic_fetch_url['url'], topic_fetch_url['message']))

    # Persist whatever ended up on the row, whether the fetch succeeded or failed
    db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], topic_fetch_url)
def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
    """Mark the download as being fetched and request its URL from the API.

    The download's 'download_time' and 'state' are set and the whole row is written back to the 'downloads'
    table. The download's URL is then turned into a signed request URL and fetched with a GET request.

    Returns the response of that request.
    """
    download = decode_object_from_bytes_if_needed(download)

    download['download_time'] = sql_now()
    download['state'] = 'fetching'
    db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)

    user_agent = UserAgent()

    signed_url = self._api_request_url_with_signature_from_config(api_url=download['url'])
    return user_agent.request(Request(method='GET', url=signed_url))
def __update_table_state(self, db: DatabaseHandler, job_state: Dict[str, Any]) -> None:
    """
    Copy the job's state and message into the configured extra table.

    The row that gets updated is the one whose '<table>_id' equals the value stored under that same key in the
    job args. Nothing is written if the args can't be decoded, no extra table is configured, or the args carry
    no ID for the table.
    """
    job_state = decode_object_from_bytes_if_needed(job_state)

    try:
        # job_states.args was changed from JSON to JSONB while sharding the database, and psycopg2 offers
        # no way to turn off JSONB decoding (unlike JSON), so "args" may come back either as a JSON string
        # or as an already decoded dictionary
        raw_args = job_state.get('args', '')
        args = raw_args if isinstance(raw_args, dict) else decode_json(raw_args)
    except Exception as ex:
        log.error(
            f"Unable to decode args from job state {job_state}: {ex}")
        return

    extra_table = self.__state_config.extra_table()
    if not extra_table:
        log.debug("Extra table for storing state is not configured.")
        return

    id_field = extra_table.table_name() + '_id'
    id_value = args.get(id_field)
    if not id_value:
        # The <table>_id may not exist yet: some jobs only create the row once run() is under way
        # (e.g. SnapshotTopic has to create the snapshot first).
        log.warning(
            f"Unable to get ID value for field '{id_field}' from job state {job_state}"
        )
        return None

    new_values = {
        extra_table.state_column(): job_state.get('state'),
        extra_table.message_column(): job_state.get('message'),
    }
    db.update_by_id(table=extra_table.table_name(), object_id=id_value, update_hash=new_values)
def store_content(
    db: DatabaseHandler,
    download: dict,
    content: str,
    amazon_s3_downloads_config: AmazonS3DownloadsConfig = None,
    download_storage_config: DownloadStorageConfig = None,
) -> dict:
    """Store the content for the download and record the outcome on its 'downloads' row.

    Arguments:
    db - db handle
    download - download row; 'downloads_id' and 'state' are read from it, and 'error_message' is read
        when the state is 'feed_error'
    content - content to store
    amazon_s3_downloads_config - optional S3 configuration; the default one is used when not given
    download_storage_config - optional storage configuration; the default one is used when not given

    Returns:
    The download row as re-read from the database after the update.

    Raises:
    McDBIDownloadsException - if getting the store or writing the content fails; the original error is
        attached as the cause.
    """
    download = decode_object_from_bytes_if_needed(download)
    content = decode_object_from_bytes_if_needed(content)

    if not amazon_s3_downloads_config:
        amazon_s3_downloads_config = _default_amazon_s3_downloads_config()
    if not download_storage_config:
        download_storage_config = _default_download_storage_config()

    # feed_error state indicates that the download was successful but that there was a problem
    # parsing the feed afterward. so we want to keep the feed_error state even if we redownload
    # the content
    new_state = 'success' if download['state'] != 'feed_error' else 'feed_error'

    try:
        store = _get_store_for_writing(
            amazon_s3_downloads_config=amazon_s3_downloads_config,
            download_storage_config=download_storage_config,
        )
        path = store.store_content(db, download['downloads_id'], content)
    except Exception as ex:
        # Chain the original error so the traceback shows what actually failed in the store
        raise McDBIDownloadsException(
            "error while trying to store download %d: %s" % (download['downloads_id'], ex)
        ) from ex

    # Only a successful download clears the error message; a feed_error keeps the one it had
    if new_state == 'success':
        download['error_message'] = ''

    db.update_by_id(
        table='downloads',
        object_id=download['downloads_id'],
        update_hash={
            'state': new_state,
            'path': path,
            'error_message': download['error_message'],
        },
    )

    download = db.find_by_id('downloads', download['downloads_id'])

    return download
def _update_media_normalized_urls(db: DatabaseHandler) -> None:
    """Keep normalized_url field in media table up to date.

    Fill in normalized_url for every media row where it is null. A session lock makes sure that only one
    process does this work at a time; the function returns early if the urls are no longer out of date.
    """
    # Generating all media urls takes a couple of hours, so guard the work with a lock to keep the other
    # workers from repeating it.
    while True:
        if not _normalized_urls_out_of_date(db):
            return

        db.begin()

        # Poll rather than block: that way the transaction can be released between attempts and we get to
        # notice when someone else has already updated all of the media.
        if get_session_lock(
                db, 'MediaWords::TM::Media::media_normalized_urls', 1, wait=False):
            break

        db.commit()
        log.info("sleeping for media_normalized_urls lock...")
        time.sleep(1)

    log.warning("updating media_normalized_urls ...")

    media = db.query(
        "select * from media where normalized_url is null").hashes()

    total = len(media)
    for position, medium in enumerate(media, start=1):
        normalized_url = mediawords.util.url.normalize_url_lossy(medium['url'])
        if normalized_url is None:
            # Fall back to the raw url when it can't be normalized
            normalized_url = medium['url']

        log.info("[%d/%d] adding %s (%s)" %
                 (position, total, medium['name'], normalized_url))

        db.update_by_id('media', medium['media_id'], {'normalized_url': normalized_url})

    db.commit()
def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
    """Mark the download as being fetched and fetch its URL.

    The download's 'download_time' and 'state' are set and the whole row is written back to the 'downloads'
    table before the URL is fetched.

    Returns the HTTP response. Raises McCrawlerFetcherSoftError if the URL is not an HTTP(S) URL.
    """
    download = decode_object_from_bytes_if_needed(download)

    url = self._download_url(download=download)
    url_is_http = is_http_url(url)
    if not url_is_http:
        raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

    download['download_time'] = sql_now()
    download['state'] = 'fetching'
    db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)

    return UserAgent().get_follow_http_html_redirects(url)
def try_update_topic_link_ref_stories_id(db: DatabaseHandler, topic_fetch_url: dict) -> None:
    """Point the topic link at the story of the given topic_fetch_url.

    topic_fetch_url['topic_links_id'] identifies the topic link to update and topic_fetch_url['stories_id']
    becomes its ref_stories_id.

    topic_links has a unique constraint on (topics_id, stories_id, ref_stories_id). Rather than checking for
    a conflict first (and possibly still hitting the constraint), the update is simply attempted and a
    violation of that constraint is ignored, which is both faster and more reliable. Any other update error
    is re-raised.
    """
    topic_links_id = topic_fetch_url['topic_links_id']
    new_values = {'ref_stories_id': topic_fetch_url['stories_id']}

    try:
        db.update_by_id('topic_links', topic_links_id, new_values)
    except mediawords.db.exceptions.handler.McUpdateByIDException as ex:
        # A unique constraint error means the stories_id / ref_stories_id pair already exists, which is
        # fine; everything else is a real failure.
        is_duplicate_link = 'unique constraint "topic_links_scr"' in str(ex)
        if not is_duplicate_link:
            raise ex
def _store_map(db: DatabaseHandler,
               topics_id: int,
               timespans_id: int,
               content: bytes,
               graph_format: str,
               color_by: str) -> None:
    """Create a timespan_maps row for the map, store its content and record the content url on the row.

    Any existing row for the same timespan, format and options is replaced within one transaction.
    """
    db.begin()

    options_json = encode_json({'color_by': color_by})

    # Drop a previous map with the same timespan / format / options before creating the new row
    db.query(
        """ DELETE FROM timespan_maps WHERE timespans_id = %(a)s AND format = %(b)s AND options = %(c)s """,
        {'a': timespans_id, 'b': graph_format, 'c': options_json}
    )

    timespan_map = db.create('timespan_maps', {
        'topics_id': topics_id,
        'timespans_id': timespans_id,
        'options': options_json,
        'format': graph_format
    })

    db.commit()

    map_id = timespan_map['timespan_maps_id']

    content_type = {
        'svg': 'image/svg+xml',
        'gexf': 'xml/gexf'
    }[graph_format]

    store_content(db, TIMESPAN_MAPS_TYPE, map_id, content, content_type)

    url = get_content_url(db, TIMESPAN_MAPS_TYPE, map_id)

    db.update_by_id('timespan_maps', map_id, {'url': url})
def update_job_state_args(self, db: DatabaseHandler, args: Dict[str, Any]) -> None:
    """Merge the given args into the args stored on the current "job_states" row.

    Keys in the given args override keys already stored on the row. If the stored args can't be decoded,
    the error is logged and the given args replace them.
    """
    args = decode_object_from_bytes_if_needed(args)

    job_state = db.require_by_id(table='job_states', object_id=self.__job_states_id)

    try:
        db_args = decode_json(job_state.get('args', '{}'))
    except Exception as ex:
        log.error(f"Unable to decode args from job state {job_state}: {ex}")
        db_args = {}

    merged_args = {**db_args, **args}

    db.update_by_id(
        table='job_states',
        object_id=self.__job_states_id,
        update_hash={'args': encode_json(merged_args)},
    )
def _update_media_normalized_urls(db: DatabaseHandler) -> None:
    """Keep normalized_url field in media table up to date.

    Fill in normalized_url for every media row where it is null. A session lock makes sure that only one
    process does this work at a time; the function returns early if the urls are no longer out of date.
    """
    # Generating all media urls takes a couple of hours, so guard the work with a lock to keep the other
    # workers from repeating it.
    while _normalized_urls_out_of_date(db):
        db.begin()

        # Poll rather than block: that way the transaction can be released between attempts and we get to
        # notice when someone else has already updated all of the media.
        got_lock = get_session_lock(db, 'MediaWords::TM::Media::media_normalized_urls', 1, wait=False)
        if got_lock:
            break

        db.commit()
        log.info("sleeping for media_normalized_urls lock...")
        time.sleep(1)
    else:
        # The urls are up to date (possibly thanks to another worker), so there is nothing to do
        return

    log.warning("updating media_normalized_urls ...")

    media = db.query("select * from media where normalized_url is null").hashes()
    total = len(media)

    for count, medium in enumerate(media, start=1):
        normalized_url = mediawords.util.url.normalize_url_lossy(medium['url'])
        if normalized_url is None:
            # Fall back to the raw url when it can't be normalized
            normalized_url = medium['url']

        log.info("[%d/%d] adding %s (%s)" % (count, total, medium['name'], normalized_url))

        db.update_by_id('media', medium['media_id'], {'normalized_url': normalized_url})

    db.commit()
def fetch_topic_url_update_state(db: DatabaseHandler,
                                 topic_fetch_urls_id: int,
                                 domain_timeout: Optional[int] = None) -> bool:
    """Run fetch_topic_url() and record the outcome on the topic_fetch_urls row.

    Returns True if the job completed and does not have to be requeued.
    Returns False if the domain was throttled and the job has to be requeued.
    Re-raises any other error, after storing it on the row.
    """
    try:
        fetch_topic_url(db=db, topic_fetch_urls_id=topic_fetch_urls_id, domain_timeout=domain_timeout)

    except McThrottledDomainException:
        # A throttled domain just means the url goes back to the end of the queue
        log.info(
            "Fetch for topic_fetch_url %d domain throttled. Requeueing ..."
            % topic_fetch_urls_id)

        requeue_update = {
            'state': FETCH_STATE_REQUEUED,
            'fetch_date': datetime.datetime.now()
        }
        db.update_by_id('topic_fetch_urls', topic_fetch_urls_id, requeue_update)
        return False

    except Exception as ex:
        # fetch_topic_url() is expected to catch every non-throttling error itself; this is a safety net
        log.error("Error while fetching URL with ID {}: {}".format(
            topic_fetch_urls_id, str(ex)))

        error_update = {
            'state': FETCH_STATE_PYTHON_ERROR,
            'fetch_date': datetime.datetime.now(),
            'message': traceback.format_exc(),
        }
        db.update_by_id('topic_fetch_urls', topic_fetch_urls_id, error_update)
        raise ex

    else:
        return True
def store_content(db: DatabaseHandler, download: dict, content: str) -> dict:
    """Store the content for the download and record the outcome on its 'downloads' row.

    Arguments:
    db - db handle
    download - download row; 'downloads_id' and 'state' are read from it
    content - content to store

    Returns:
    The download row as re-read from the database after the update.

    Raises:
    McDBIDownloadsException - if writing the content fails; the original error is attached as the cause.
        The 'downloads' row is not touched in that case.
    """
    # A download that is already in the 'feed_error' state keeps that state; anything else becomes 'success'
    new_state = 'success'
    if download['state'] == 'feed_error':
        new_state = download['state']

    try:
        path = _get_store_for_writing().store_content(db, download['downloads_id'], content)
    except Exception as e:
        # The key is 'downloads_id'; reading a misspelled key here would raise KeyError and hide the
        # real storage error from the caller
        raise McDBIDownloadsException(
            "error while trying to store download %d: %s" % (download['downloads_id'], e)
        ) from e

    # Reaching this point means the content was stored, so the error message is always cleared
    db.update_by_id(
        'downloads',
        download['downloads_id'],
        {'state': new_state, 'path': path, 'error_message': ''},
    )

    download = db.find_by_id('downloads', download['downloads_id'])

    return download
def try_update_topic_link_ref_stories_id(db: DatabaseHandler, topic_fetch_url: dict) -> None:
    """Point the topic link at the story of the given topic_fetch_url.

    topic_fetch_url['topic_links_id'] identifies the topic link to update and topic_fetch_url['stories_id']
    becomes its ref_stories_id. Nothing happens when the topic_fetch_url has no topic_links_id.

    topic_links has a unique constraint on (topics_id, stories_id, ref_stories_id). Rather than checking for
    a conflict first (and possibly still hitting the constraint), the update is simply attempted and a
    violation of that constraint is ignored, which is both faster and more reliable. Any other update error
    is re-raised.
    """
    topic_links_id = topic_fetch_url.get('topic_links_id')
    if topic_links_id is None:
        return

    try:
        db.update_by_id('topic_links', topic_links_id, {'ref_stories_id': topic_fetch_url['stories_id']})
    except McUpdateByIDException as ex:
        # A unique constraint error means the stories_id / ref_stories_id pair already exists, which is
        # fine; everything else is a real failure.
        is_duplicate_link = 'unique constraint "topic_links_scr"' in str(ex)
        if not is_duplicate_link:
            raise ex
def _log_download_error(db: DatabaseHandler, download: Optional[dict], error_message: str) -> None:
    """Log a download error and, depending on the download's state, store it on the 'downloads' row.

    Without a download only a warning is logged. Otherwise a warning is logged and, unless the download's
    state is 'fetching' or 'queued', the download is set to the 'error' state with the given error message
    and written back to the database.

    Raises McCrawlerFetcherHardError if the error can't be written to the database.
    """
    if not download:
        log.warning(
            f"Error while getting download from queue: {error_message}")
        return

    log.warning(
        f"Error while fetching download {download['downloads_id']}: {error_message}"
    )

    if download['state'] in {'fetching', 'queued'}:
        return

    downloads_id = download['downloads_id']

    download['state'] = 'error'
    download['error_message'] = error_message

    try:
        db.update_by_id(table='downloads', object_id=downloads_id, update_hash=download)
    except Exception as ex:
        # Not being able to record the error in the database is really bad, hence the hard exception
        hard_error_message = (
            f"Unable to log download error for download {downloads_id} in the database; "
            f"download error: {error_message}; database error: {ex}"
        )
        raise McCrawlerFetcherHardError(hard_error_message)
def update_job_state_message(self, db: DatabaseHandler, message: str) -> None:
    """
    Set the message field of the current "job_states" row.

    This method is public so that code running anywhere above run() in the stack can publish progress
    messages for a long running job. The updated row is also passed on to the extra state table update.
    """
    message = decode_object_from_bytes_if_needed(message)

    # Only called for its check that the row exists; the returned row is not used
    db.require_by_id(table='job_states', object_id=self.__job_states_id)

    changes = {
        'message': message,
        'last_updated': sql_now(),
    }
    job_state = db.update_by_id(table='job_states', object_id=self.__job_states_id, update_hash=changes)

    self.__update_table_state(db=db, job_state=job_state)
def update_job_state(self, db: DatabaseHandler, state: str, message: Optional[str] = ''):
    """
    Set the state and message fields of the "job_states" row for the currently active "job_states_id".

    "job_states_id" is set and unset in method run(), so this has to be called from code running inside
    the subclass's run() implementation. The updated row is also passed on to the extra state table update.
    """
    state = decode_object_from_bytes_if_needed(state)
    message = decode_object_from_bytes_if_needed(message)

    log.debug(f"{self.__queue_name} state: {state}")

    changes = {
        'state': state,
        'last_updated': sql_now(),
        'message': message,
    }
    job_state = db.update_by_id(table='job_states', object_id=self.__job_states_id, update_hash=changes)

    self.__update_table_state(db=db, job_state=job_state)
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the tags, the first download with its
    content and extracted text, and the story sentences.

    Arguments:
    db - db handle
    topic - topic that the new story is added to
    old_story - story to copy
    new_medium - medium that the copy is assigned to

    Returns:
    The new story.
    """
    story = db.create(
        'stories',
        {
            'url': old_story['url'],
            'media_id': new_medium['media_id'],
            'guid': old_story['guid'],
            'publish_date': old_story['publish_date'],
            'collect_date': sql_now(),
            'description': old_story['description'],
            'title': old_story['title']
        },
    )

    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    # Copy the tags one by one, skipping any pair that already exists
    for old_story_tag in db.query(
        """ SELECT tags_id FROM stories_tags_map WHERE stories_id = %(stories_id)s ORDER BY tags_id """,
        {'stories_id': old_story['stories_id']},
    ).hashes():
        stories_id = story['stories_id']
        tags_id = old_story_tag['tags_id']

        db.query(
            (
                " INSERT INTO stories_tags_map (stories_id, tags_id)"
                " VALUES (%(stories_id)s, %(tags_id)s)"
                " ON CONFLICT (stories_id, tags_id) DO NOTHING "
            ),
            {
                'stories_id': stories_id,
                'tags_id': tags_id,
            })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    # Only the story's first download is copied
    old_download = db.query(
        """ SELECT * FROM downloads WHERE stories_id = %(stories_id)s ORDER BY downloads_id LIMIT 1 """,
        {
            'stories_id': old_story['stories_id'],
        }
    ).hash()

    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            # The content couldn't be copied, so at least mirror the old download's status
            download_update = {f: old_download[f] for f in ['state', 'error_message', 'download_time']}
            db.update_by_id('downloads', download['downloads_id'], download_update)

        # Copy the extracted text of the OLD download over to the new one. Selecting by the new download's
        # own id (as this query used to do) can never match the old download's text rows.
        db.query(
            """
            INSERT INTO download_texts (
                downloads_id,
                download_text,
                download_text_length
            )
                SELECT
                    %(new_downloads_id)s,
                    dt.download_text,
                    dt.download_text_length
                FROM download_texts AS dt
                WHERE dt.downloads_id = %(old_downloads_id)s
            """,
            {
                'new_downloads_id': download['downloads_id'],
                'old_downloads_id': old_download['downloads_id'],
            },
        )

    # NOTE(review): the copied sentences keep the media_id stored on the old story's sentences -- confirm
    # that this is intended for a story that now belongs to new_medium
    # noinspection SqlInsertValues
    db.query(
        (
            " INSERT INTO story_sentences ( stories_id, sentence_number, sentence, media_id, publish_date, language )"
            " SELECT %(new_stories_id)s, sentence_number, sentence, media_id, publish_date, language"
            " FROM story_sentences WHERE stories_id = %(old_stories_id)s "
        ),
        {
            'old_stories_id': old_story['stories_id'],
            'new_stories_id': int(story['stories_id']),
        },
    )

    return story
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the tags, the first download with its
    content and extracted text, and the story sentences.

    Arguments:
    db - db handle
    topic - topic that the new story is added to
    old_story - story to copy
    new_medium - medium that the copy is assigned to

    Returns:
    The new story.
    """
    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': mediawords.util.sql.sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }

    story = db.create('stories', story)
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    # Copy all tags of the old story in one statement
    db.query(
        (
            " insert into stories_tags_map (stories_id, tags_id)"
            " select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s "
        ),
        {'a': story['stories_id'], 'b': old_story['stories_id']})

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    # Only the story's first download is copied
    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {'a': old_story['stories_id']}).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = mediawords.dbi.downloads.fetch_content(db, old_download)
            download = mediawords.dbi.downloads.store_content(db, download, content)
        except (mediawords.dbi.downloads.McDBIDownloadsException,
                mediawords.key_value_store.amazon_s3.McAmazonS3StoreException):
            # The content couldn't be copied, so at least mirror the old download's status
            download_update = {f: old_download[f] for f in ['state', 'error_message', 'download_time']}
            db.update_by_id('downloads', download['downloads_id'], download_update)

        # Copy the extracted text of the OLD download over to the new one. Selecting by the new download's
        # own id (as this query used to do) can never match the old download's text rows.
        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(b)s
            """,
            {'a': download['downloads_id'], 'b': old_download['downloads_id']})

    # NOTE(review): the copied sentences keep the media_id stored on the old story's sentences -- confirm
    # that this is intended for a story that now belongs to new_medium
    db.query(
        (
            " insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)"
            " select %(a)s, sentence_number, sentence, media_id, publish_date, language"
            " from story_sentences where stories_id = %(b)s "
        ),
        {'a': story['stories_id'], 'b': old_story['stories_id']})

    return story
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the tags, the first download with its
    content and extracted text, and the story sentences.

    Arguments:
    db - db handle
    topic - topic that the new story is added to
    old_story - story to copy
    new_medium - medium that the copy is assigned to

    Returns:
    The new story.
    """
    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }

    story = db.create('stories', story)
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    # Copy all tags of the old story in one statement
    db.query(
        (
            " insert into stories_tags_map (stories_id, tags_id)"
            " select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s "
        ),
        {
            'a': story['stories_id'],
            'b': old_story['stories_id']
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id']
    })

    # Only the story's first download is copied
    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {
            'a': old_story['stories_id']
        }).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            # The content couldn't be copied, so at least mirror the old download's status
            download_update = {
                f: old_download[f]
                for f in ['state', 'error_message', 'download_time']
            }
            db.update_by_id('downloads', download['downloads_id'], download_update)

        # Copy the extracted text of the OLD download over to the new one. Selecting by the new download's
        # own id (as this query used to do) can never match the old download's text rows.
        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(b)s
            """,
            {'a': download['downloads_id'], 'b': old_download['downloads_id']})

    # The new stories_id is interpolated into the statement itself; the int() cast keeps that safe.
    # NOTE(review): the copied sentences keep the media_id stored on the old story's sentences -- confirm
    # that this is intended for a story that now belongs to new_medium
    # noinspection SqlInsertValues
    db.query(
        (
            " insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)"
            f" select {int(story['stories_id'])} as stories_id, sentence_number, sentence, media_id, publish_date,"
            " language from story_sentences where stories_id = %(b)s "
        ),
        {'b': old_story['stories_id']})

    return story
def fetch_topic_url(db: DatabaseHandler, topic_fetch_urls_id: int, domain_timeout: typing.Optional[int] = None) -> None:
    """Fetch a url for a topic and create a media cloud story from it if its content matches the topic pattern.

    Update the following fields in the topic_fetch_urls row:

    code - the status code of the http response
    fetch_date - the current time
    state - one of the FETCH_STATE_* constants
    message - message related to the state (eg. HTTP message for FETCH_STATE_REQUEST_FAILED)
    stories_id - the id of the story generated from the fetched content, or null if no story created

    If topic_links_id is present in the topic_fetch_url and if a story was added or matched, assign the resulting
    topic_fetch_urls.stories_id to topic_links.ref_stories_id.

    If the state is anything but FETCH_STATE_PENDING or FETCH_STATE_REQUEUED, return without doing anything.

    If there is content for the corresponding url and topics_id in topic_seed_urls, use that content instead of
    fetching the url.

    This function catches almost all possible exceptions and stashes them in topic_fetch_urls along with a state of
    FETCH_STATE_PYTHON_ERROR. The exception is McThrottledDomainException, which is re-raised to the caller; the
    topic_fetch_urls row is not written back in that case.

    Arguments:
    db - db handle
    topic_fetch_urls_id - id of topic_fetch_urls row
    domain_timeout - passed through to _try_fetch_topic_url()

    Returns:
    None
    """
    topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)

    try:
        log.info("fetch_link: %s" % topic_fetch_url['url'])
        # NOTE(review): the fields listed in the docstring are presumably filled in on topic_fetch_url by this
        # helper; this function only writes the row back at the end -- confirm against the helper
        _try_fetch_topic_url(db=db, topic_fetch_url=topic_fetch_url, domain_timeout=domain_timeout)

        # Point the topic link at the story as soon as both ids are known
        if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

        if 'stories_id' in topic_fetch_url and topic_fetch_url['stories_id'] is not None:
            story = db.require_by_id('stories', topic_fetch_url['stories_id'])
            topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
            redirect_url = topic_fetch_url['url']
            assume_match = topic_fetch_url['assume_match']
            # Only add the story to the topic if it is not in it yet and it matches the topic
            if _is_not_topic_story(db, topic_fetch_url):
                if _story_matches_topic(db, story, topic, redirect_url=redirect_url, assume_match=assume_match):
                    mediawords.tm.stories.add_to_topic_stories(db, story, topic)

            # NOTE(review): this repeats the link update done above; nesting was reconstructed from
            # whitespace-collapsed source -- confirm placement and whether the second call is still needed
            if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
                try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    except McThrottledDomainException as ex:
        # Must be listed before the catch-all handler below so that throttling reaches the caller
        raise ex

    except Exception as ex:
        log.error("Error while fetching URL {}: {}".format(topic_fetch_url, ex))

        # Record the failure on the row instead of propagating it
        topic_fetch_url['state'] = FETCH_STATE_PYTHON_ERROR
        topic_fetch_url['message'] = traceback.format_exc()
        log.warning('topic_fetch_url %s failed: %s' % (topic_fetch_url['url'], topic_fetch_url['message']))

    # Persist whatever ended up on the row, whether the fetch succeeded or failed
    db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], topic_fetch_url)
def _update_tfu_message(db: DatabaseHandler, topic_fetch_url: dict, message: str) -> None:
    """Store a debug message on the topic_fetch_urls row; a no-op unless debug messages are enabled."""
    if not _USE_TFU_DEBUG_MESSAGES:
        return

    message_update = {'message': message}
    db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], message_update)