def ungzipped_response_content(response: Response) -> str:
    """Return the response's content as text, gunzipping it first when needed.

    When the response is detected as gzipped data, its raw payload is
    gunzipped and decoded as UTF-8 (undecodable bytes are replaced). If
    gunzipping fails with McGunzipException, the failure is logged and the
    response's own decoded content is returned instead.

    :param response: HTTP response to read the content from.
    :return: Response content as a string.
    """
    content = None

    if __response_is_gzipped_data(response):
        compressed = response.raw_data()
        try:
            content = gunzip(compressed).decode('utf-8', errors='replace')
        except McGunzipException as ex:
            # Best effort: report the problem and fall through to the
            # response's own decoding below.
            log.error("Unable to gunzip response {}: {}".format(response, ex))

    if content is None:
        # Either the response was not gzipped, or gunzipping failed.
        content = response.decoded_content()

    assert isinstance(content, str)

    return content
def store_response(self, db: DatabaseHandler, download: dict, response: Response) -> None:
    """Store the response fetched for a download and queue its stories for extraction.

    Unsuccessful responses are recorded through
    _store_failed_download_error_message() and nothing else is done.
    For successful responses the content is handed to store_download(), and
    every story ID it returns is added to the
    'MediaWords::Job::ExtractAndVector' queue.

    :param db: Database handler used for queries.
    :param download: Download row; its 'downloads_id' and 'url' keys are read.
    :param response: HTTP response that was fetched for the download.
    """
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']
    download_url = download['url']

    log.info(f"Handling download {downloads_id}...")
    log.debug(f"(URL of download {downloads_id} which is about to be handled: {download_url})")

    if not response.is_success():
        log.info(f"Download {downloads_id} errored: {response.decoded_content()}")
        self._store_failed_download_error_message(db=db, download=download, response=response)
        return

    # Only content types matching this pattern get their body stored;
    # anything else is replaced with a placeholder string.
    content_type = response.content_type() or ''
    is_supported_type = re.search(
        r'text|html|xml|rss|atom|application/json',
        content_type,
        flags=re.IGNORECASE,
    )
    content = response.decoded_content() if is_supported_type else '(unsupported content type)'

    # Write the download's URL back to its row when the stored value differs.
    update_url_sql = (
        " UPDATE downloads SET url = %(download_url)s"
        " WHERE downloads_id = %(downloads_id)s"
        " AND url != %(download_url)s "
    )
    db.query(update_url_sql, {
        'downloads_id': downloads_id,
        'download_url': download_url,
    })

    story_ids_to_extract = self.store_download(db=db, download=download, content=content)

    for stories_id in story_ids_to_extract:
        log.debug(f"Adding story {stories_id} for download {downloads_id} to extraction queue...")
        extraction_queue = JobBroker(queue_name='MediaWords::Job::ExtractAndVector')
        extraction_queue.add_to_queue(stories_id=stories_id)

    log.info(f"Handled download {downloads_id}...")
    log.debug(f"(URL of download {downloads_id} that was just handled: {download_url})")
def __solr_error_message_from_response(response: Response) -> str:
    """Build a human-readable error message from a failed Solr response.

    The message depends on the kind of failure:

    * client-side (UserAgent) errors produce 'UserAgent error: ...';
    * 4xx responses produce 'Client error: ...' by default, replaced with
      'Solr error: ...' only when the response body parses to a JSON object
      carrying both an error message ('error' -> 'msg') and the request
      parameters ('responseHeader' -> 'params');
    * 5xx responses produce 'Server error: ...';
    * anything else produces 'Other error error: ...'.

    :param response: Response of the failed Solr request.
    :return: Error message; always a string.
    """
    if response.error_is_client_side():
        # UserAgent error (UserAgent wasn't able to connect to the server or something like that)
        return f'UserAgent error: {response.decoded_content()}'

    status_code_str = str(response.code())

    if status_code_str.startswith('5'):
        # Server error
        return f'Server error: {response.status_line()} {response.decoded_content()}'

    if not status_code_str.startswith('4'):
        # Some weird stuff
        return f'Other error error: {response.status_line()} {response.decoded_content()}'

    # Client error - set default message; it must survive unless a complete
    # Solr error description is successfully parsed out below.
    solr_response_maybe_json = response.decoded_content()
    error_message = f'Client error: {response.status_line()} {solr_response_maybe_json}'

    if not solr_response_maybe_json:
        return error_message

    solr_response_json = {}
    try:
        solr_response_json = response.decoded_json()
    except Exception as ex:
        log.debug(f"Unable to parse Solr error response: {ex}; raw response: {solr_response_maybe_json}")

    # The body may be valid JSON without being an object (e.g. a list or a
    # string), and the nested sections may not be objects either.
    if not isinstance(solr_response_json, dict):
        solr_response_json = {}

    error_section = solr_response_json.get('error', {})
    header_section = solr_response_json.get('responseHeader', {})

    solr_error_message = error_section.get('msg') if isinstance(error_section, dict) else None
    request_params = header_section.get('params') if isinstance(header_section, dict) else None

    if solr_error_message and request_params:
        request_params_json = encode_json(request_params)

        # If we were able to decode Solr error message, overwrite the default error message with it
        error_message = f'Solr error: "{solr_error_message}", params: {request_params_json}'

    return error_message