def _document_to_pdf(self, file_path, entity):
    """Converts an office document to PDF."""
    file_name = entity_filename(entity)
    mime_type = entity.first('mimeType')
    log.info('Converting [%s] to PDF...', file_name)
    for attempt in count(1):
        try:
            with open(file_path, 'rb') as fh:
                files = {'file': (file_name, fh, mime_type)}
                res = requests.post(CONVERT_URL,
                                    params={'timeout': CONVERT_TIMEOUT},
                                    files=files,
                                    timeout=CONVERT_TIMEOUT + 10,
                                    stream=True)
            res.raise_for_status()
            out_path = self.make_work_file('out.pdf')
            with open(out_path, 'wb') as fh:
                bytes_written = 0
                for chunk in res.iter_content(chunk_size=None):
                    bytes_written += len(chunk)
                    fh.write(chunk)
            if bytes_written > 50:
                return out_path
            raise ProcessingException("Could not be converted to PDF.")
        except HTTPError as exc:
            if exc.response.status_code == 400:
                raise ProcessingException(exc.response.text)
            msg = "Converter not available: %s (attempt: %s)"
            log.info(msg, exc, attempt)
            backoff(failures=math.sqrt(attempt))
        except RequestException as exc:
            msg = "Converter not available: %s (attempt: %s)"
            log.error(msg, exc, attempt)
            backoff(failures=math.sqrt(attempt))
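# Nearly all of the variants below lean on a pair of retry helpers,
# service_retries() and backoff(), that are defined elsewhere. A minimal
# sketch of what they might look like; the attempt budget and the jitter
# formula are illustrative assumptions, not taken from the source.
import random
import time


def service_retries(max_attempts=30):
    # Yield attempt counters for calls to a flaky external service.
    # The default budget of 30 attempts is an assumption.
    return range(max_attempts)


def backoff(failures=0):
    # Sleep for a period that grows with the failure count, plus random
    # jitter so that competing workers do not retry in lock-step.
    sleep = max(1, failures) + random.random()
    time.sleep(sleep)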
def _document_to_pdf(self, file_path, result, work_path):
    """Converts an office document to PDF."""
    log.info('Converting [%s] to PDF...', result.file_name)
    out_path = os.path.basename(file_path)
    out_path = join_path(work_path, '%s.pdf' % out_path)
    file_name = result.file_name or 'data'
    mime_type = result.mime_type or DEFAULT
    for attempt in service_retries():
        fh = open(file_path, 'rb')
        try:
            files = {'file': (file_name, fh, mime_type)}
            res = requests.post(self.SERVICE_URL,
                                files=files,
                                timeout=(5, 305),
                                stream=True)
            res.raise_for_status()
            # Use a separate handle for the output so that the finally
            # clause below closes the input file, not the PDF.
            with open(out_path, 'wb') as out_fh:
                for chunk in res.iter_content(chunk_size=None):
                    out_fh.write(chunk)
            return out_path
        except RequestException as exc:
            if isinstance(exc, HTTPError):
                if exc.response.status_code == 400:
                    raise ProcessingException(exc.response.text)
            log.error("Conversion failed: %s", exc)
            backoff(failures=attempt)
        finally:
            fh.close()
    raise ProcessingException("Document could not be converted to PDF.")
def _document_to_pdf(self, file_path, entity):
    """Converts an office document to PDF."""
    if UNOSERVICE_URL is None:
        raise RuntimeError("No UNOSERVICE_URL for document conversion.")
    file_name = entity.first('fileName') or 'data'
    mime_type = entity.first('mimeType') or DEFAULT
    log.info('Converting [%s] to PDF...', file_name)
    for attempt in service_retries():
        fh = open(file_path, 'rb')
        try:
            files = {'file': (file_name, fh, mime_type)}
            res = requests.post(UNOSERVICE_URL,
                                files=files,
                                timeout=(5, 305),
                                stream=True)
            if res.status_code > 399:
                raise ProcessingException(res.text)
            out_path = self.make_work_file('out.pdf')
            # Use a separate handle for the output so that the finally
            # clause below closes the input file, not the PDF.
            with open(out_path, 'wb') as out_fh:
                bytes_written = 0
                for chunk in res.iter_content(chunk_size=None):
                    bytes_written += len(chunk)
                    out_fh.write(chunk)
            if bytes_written > 50:
                return out_path
        except RequestException as exc:
            log.error("Conversion failed: %s", exc)
            backoff(failures=attempt)
        finally:
            fh.close()
    raise ProcessingException("Document could not be converted to PDF.")
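# The _document_to_pdf variants write their output through
# self.make_work_file(), which is not shown here. A plausible sketch,
# assuming the ingestor keeps a per-task scratch directory; the
# self.work_path attribute is hypothetical.
import os
import tempfile


def make_work_file(self, name):
    # Allocate a fresh directory under the task's scratch space so that
    # repeated conversion attempts never clobber each other's output.
    work_dir = tempfile.mkdtemp(dir=self.work_path)
    return os.path.join(work_dir, name)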
def _delete_blob(self, blob):
    for attempt in service_retries():
        try:
            blob.delete()
            return
        except NotFound:
            return
        except FAILURES:
            log.exception("Delete error in GS")
            backoff(failures=attempt)
def wait_for_redis(conn):
    """Wait for redis to load its data into memory on initial system
    bootup."""
    for attempt in service_retries():
        try:
            conn.get('test_redis_ready')
            return conn
        except BusyLoadingError:
            log.info("Waiting for redis to load...")
            backoff(failures=attempt)
    raise RuntimeError("Redis is not ready.")
def index_safe(index, id, body, **kwargs):
    """Index a single document and retry until it has been stored."""
    for attempt in service_retries():
        try:
            es.index(index=index, id=id, body=body, **kwargs)
            body['id'] = str(id)
            body.pop('text', None)
            return body
        except TransportError as exc:
            log.warning("Index error [%s:%s]: %s", index, id, exc)
            backoff(failures=attempt)
def wait_for_redis(pool):
    """Wait for redis to load its data into memory on initial system
    bootup."""
    for attempt in service_retries():
        try:
            conn = Redis(connection_pool=pool, decode_responses=True)
            conn.ping()
            return
        except BusyLoadingError:
            log.info("Waiting for redis to load...")
            backoff(failures=attempt)
    raise RuntimeError("Redis is not ready.")
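# On system bootup, either wait_for_redis variant would be invoked once
# before any workers start consuming tasks. A usage sketch for the
# pool-based variant; the URL is an illustrative placeholder.
from redis import ConnectionPool

pool = ConnectionPool.from_url('redis://localhost:6379/0')
wait_for_redis(pool)  # blocks until redis has finished loading its dump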
def index_safe(index, id, body, **kwargs):
    """Index a single document and retry until it has been stored."""
    for attempt in service_retries():
        try:
            es.index(index=index, id=id, body=body, **kwargs)
            body["id"] = str(id)
            body.pop("text", None)
            return body
        except TransportError as exc:
            # Do not retry client errors such as mapping conflicts or
            # authorisation failures. status_code is an integer, so it
            # must be compared against ints, not strings.
            if exc.status_code in (400, 403):
                raise
            log.warning("Index error [%s:%s]: %s", index, id, exc)
            backoff(failures=attempt)
@classmethod
def handle_done(cls, queue):
    if not queue.is_done():
        return
    # HACK: randomly wait a little to avoid double-triggering the
    # index process.
    backoff()
    index = ServiceQueue(queue.conn,
                         ServiceQueue.OP_INDEX,
                         queue.dataset,
                         priority=queue.priority)
    if index.is_done():
        log.info("Ingest %r finished, queue index...", queue.dataset)
        index.queue_task({}, {})
    queue.remove()
def get_es():
    url = settings.ELASTICSEARCH_URL
    timeout = settings.ELASTICSEARCH_TIMEOUT
    for attempt in service_retries():
        try:
            if not hasattr(settings, "_es_instance"):
                es = Elasticsearch(url, timeout=timeout)
                es.info()
                settings._es_instance = es
            return settings._es_instance
        except TransportError as exc:
            log.warning("ElasticSearch error: %s", exc.error)
            backoff(failures=attempt)
    raise RuntimeError("Could not connect to ElasticSearch")
def query_delete(index, query, sync=False, **kwargs):
    """Delete all documents matching the given query inside the index."""
    for attempt in service_retries():
        try:
            es.delete_by_query(index=index,
                               body={'query': query},
                               conflicts='proceed',
                               wait_for_completion=sync,
                               refresh=refresh_sync(sync),
                               **kwargs)
            return
        except TransportError as exc:
            log.warning("Query delete failed: %s", exc)
            backoff(failures=attempt)
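# Both query_delete variants compute the ES refresh behaviour through
# refresh_sync(), which is not shown. A minimal sketch, assuming it maps
# the boolean sync flag onto ElasticSearch's refresh parameter.
def refresh_sync(sync):
    # Force an index refresh when the caller asked for synchronous
    # behaviour; otherwise let ES refresh on its own schedule.
    return True if sync else False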
def load_file(self, content_hash, file_name=None, temp_path=None):
    """Retrieve a file from Google storage and put it onto the local
    file system for further processing."""
    for attempt in service_retries():
        try:
            blob = self._locate_contenthash(content_hash)
            if blob is not None:
                path = self._local_path(content_hash, file_name, temp_path)
                blob.download_to_filename(path)
                return path
        except FAILURES:
            log.exception("Load error in GS")
            backoff(failures=attempt)
    # Returns None for "persistent error" as well as "file not found" :/
    log.debug("[%s] not found, or the backend is down.", content_hash)
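# The GS methods retry on a FAILURES tuple defined elsewhere. A plausible
# definition covering the transient Google Cloud errors that are worth
# retrying; the exact membership is an assumption.
from google.api_core.exceptions import (
    InternalServerError,
    ServiceUnavailable,
    TooManyRequests,
)

# Permanent conditions such as NotFound are deliberately excluded; the
# callers handle those separately.
FAILURES = (InternalServerError, ServiceUnavailable, TooManyRequests)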
def query_delete(index, query, sync=False, **kwargs):
    """Delete all documents matching the given query inside the index."""
    for attempt in service_retries():
        try:
            es.delete_by_query(index=index,
                               body={'query': query},
                               conflicts='proceed',
                               wait_for_completion=sync,
                               refresh=refresh_sync(sync),
                               request_timeout=MAX_REQUEST_TIMEOUT,
                               timeout=MAX_TIMEOUT,
                               **kwargs)
            return
        except TransportError as exc:
            # status_code can be the string "N/A" on connection errors,
            # so compare directly rather than coercing with int().
            if exc.status_code in (400, 403):
                raise
            log.warning("Query delete failed: %s", exc)
            backoff(failures=attempt)
def _document_to_pdf(self, file_path, entity):
    """Converts an office document to PDF."""
    # Attempt to guess an appropriate timeout for processing.
    # Guessed: 15s per MB of data, clamped to a maximum.
    file_size = file_path.stat().st_size
    if file_size < 100:
        raise ProcessingException("Document too small.")
    file_size = (file_size / 1024) / 1024  # megabytes
    timeout = int(min(600, max(20, file_size * 15)))
    file_name = entity_filename(entity)
    mime_type = entity.first('mimeType')
    log.info('Converting [%s] to PDF (%ds timeout)...', file_name, timeout)
    failed = ProcessingException("Document could not be converted to PDF.")
    for attempt in service_retries():
        try:
            with open(file_path, 'rb') as fh:
                files = {'file': (file_name, fh, mime_type)}
                res = requests.post(CONVERT_URL,
                                    params={'timeout': timeout},
                                    files=files,
                                    timeout=timeout + 3,
                                    stream=True)
            res.raise_for_status()
            out_path = self.make_work_file('out.pdf')
            with open(out_path, 'wb') as fh:
                bytes_written = 0
                for chunk in res.iter_content(chunk_size=None):
                    bytes_written += len(chunk)
                    fh.write(chunk)
            if bytes_written > 50:
                return out_path
            raise failed
        except RequestException as exc:
            if isinstance(exc, HTTPError) and \
                    exc.response.status_code == 400:
                raise ProcessingException(exc.response.text)
            log.error("Conversion failed: %s", exc)
            backoff(failures=math.sqrt(attempt))
    raise failed
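# The timeout heuristic above is easiest to check with concrete numbers:
# a 10 MB file gets 10 * 15 = 150s, a tiny file is clamped up to 20s, and
# anything beyond 40 MB hits the 600s ceiling. Extracted as a pure
# function for illustration; the name guess_timeout is hypothetical.
def guess_timeout(size_bytes):
    # 15 seconds per megabyte of input, clamped to the range [20, 600].
    megabytes = (size_bytes / 1024) / 1024
    return int(min(600, max(20, megabytes * 15)))


assert guess_timeout(10 * 1024 * 1024) == 150
assert guess_timeout(500) == 20
assert guess_timeout(100 * 1024 * 1024) == 600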
def get_es():
    url = settings.ELASTICSEARCH_URL
    timeout = settings.ELASTICSEARCH_TIMEOUT
    for attempt in service_retries():
        try:
            if not hasattr(settings, "_es_instance"):
                # When logging structured logs, use a custom transport to
                # log all es queries and their response time.
                if sls.LOG_FORMAT == LOG_FORMAT_JSON:
                    es = Elasticsearch(url,
                                       transport_class=LoggingTransport,
                                       timeout=timeout)
                else:
                    es = Elasticsearch(url, timeout=timeout)
                es.info()
                settings._es_instance = es
            return settings._es_instance
        except TransportError as exc:
            log.warning("ElasticSearch error: %s", exc.error)
            backoff(failures=attempt)
    raise RuntimeError("Could not connect to ElasticSearch")
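# The LoggingTransport referenced above is a custom transport class that
# is not shown. A sketch of how it could be built on elasticsearch-py
# 7.x, where Transport.perform_request() is the override point; the
# structured field names are assumptions.
import time

from elasticsearch import Transport


class LoggingTransport(Transport):
    def perform_request(self, method, url, headers=None, params=None,
                        body=None):
        # Time every request and emit a structured log line alongside
        # the normal response handling.
        start = time.time()
        result = super().perform_request(method, url, headers=headers,
                                         params=params, body=body)
        took_ms = int((time.time() - start) * 1000)
        log.info("ES %s %s", method, url,
                 extra={"took_ms": took_ms, "es_url": url})
        return result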
def _document_to_pdf(self, file_path, entity):
    """Converts an office document to PDF."""
    file_name = entity_filename(entity)
    mime_type = entity.first("mimeType")
    for attempt in count(1):
        log.debug("Converting [%s] to PDF (attempt %d)...", entity, attempt)
        try:
            with open(file_path, "rb") as fh:
                files = {"file": (file_name, fh, mime_type)}
                res = requests.post(
                    CONVERT_URL,
                    params={"timeout": CONVERT_TIMEOUT},
                    files=files,
                    timeout=CONVERT_TIMEOUT + 10,
                    stream=True,
                )
            res.raise_for_status()
            out_path = self.make_work_file("out.pdf")
            with open(out_path, "wb") as fh:
                bytes_written = 0
                for chunk in res.iter_content(chunk_size=None):
                    bytes_written += len(chunk)
                    fh.write(chunk)
            if bytes_written > 50:
                return out_path
            raise ProcessingException("Could not be converted to PDF.")
        except HTTPError as exc:
            if exc.response.status_code in (400, 500):
                # A 500 might also be a temporary error in the conversion
                # service, but all attempts to tell these cases apart have
                # failed so far.
                raise ProcessingException(exc.response.text)
            msg = "Converter not available: %s (attempt: %s)"
            log.info(msg, exc, attempt)
            backoff(failures=math.sqrt(attempt))
        except RequestException as exc:
            msg = "Converter not available: %s (attempt: %s)"
            log.error(msg, exc, attempt)
            backoff(failures=math.sqrt(attempt))
def archive_file(self, file_path, content_hash=None, mime_type=None):
    """Store the file located at the given path on Google, based on a
    path made up from its SHA1 content hash."""
    file_path = ensure_path(file_path)
    if content_hash is None:
        content_hash = checksum(file_path)
    if content_hash is None:
        return
    file_path = ensure_posix_path(file_path)
    for attempt in service_retries():
        try:
            # blob = self._locate_contenthash(content_hash)
            # if blob is not None:
            #     return content_hash
            path = os.path.join(path_prefix(content_hash), "data")
            blob = Blob(path, self.bucket)
            blob.upload_from_filename(file_path, content_type=mime_type)
            return content_hash
        except FAILURES:
            log.exception("Store error in GS")
            backoff(failures=attempt)
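# Taken together, the storage methods support a simple round trip:
# archive a local file under its content hash, then fetch it back by
# that hash later. A usage sketch, assuming `storage` is an instance of
# the class these methods belong to; the constructor name is
# hypothetical.
storage = GoogleStorageArchive()  # hypothetical constructor
content_hash = storage.archive_file('/tmp/upload.doc',
                                    mime_type='application/msword')
if content_hash is not None:
    local_path = storage.load_file(content_hash, file_name='upload.doc')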