def download_and_analyze_file(self, file_id: int,
                              address_pipeline: AddressPipeline,
                              fallback_city: str) -> bool:
    """Download and analyse a single file: extract text, locations and persons.

    Downloads the file behind ``file.get_oparl_url()`` into a temporary file,
    uploads it to minio, extracts text (unless the API already provided
    ``parsed_text``), and links extracted locations and mentioned persons.

    Returns False for http errors on downloading and True otherwise.
    """
    file = File.objects.get(id=file_id)
    url = file.get_oparl_url()

    with NamedTemporaryFile() as tmpfile:
        try:
            content, content_type = self.loader.load_file(url)
            # The server-reported content type wins over the stored mime
            # type, but a divergence is worth a warning.
            if content_type and file.mime_type and content_type != file.mime_type:
                logger.warning(
                    "Diverging mime types: Expected {}, got {}".format(
                        file.mime_type, content_type))
            file.mime_type = content_type or file.mime_type
            tmpfile.write(content)
            # Rewind so the minio upload and the text extraction below read
            # from the start of the temporary file.
            tmpfile.file.seek(0)
            file.filesize = len(content)
        except HTTPError:
            # NOTE(review): only HTTPError is handled here; connection
            # errors/timeouts from the loader would propagate — confirm
            # whether that is intended (a later revision catches
            # RequestException).
            logger.exception("File {}: Failed to download {}".format(
                file.id, url))
            return False

        logger.debug("File {}: Downloaded {} ({}, {})".format(
            file.id, url, file.mime_type, filesizeformat(file.filesize)))

        # Mirror the raw file into minio under the database id as object key.
        minio_client().put_object(
            minio_file_bucket,
            str(file.id),
            tmpfile.file,
            file.filesize,
            content_type=file.mime_type,
        )

        # If the api has text, keep that
        if self.download_files and not file.parsed_text:
            file.parsed_text, file.page_count = extract_from_file(
                tmpfile.file, tmpfile.name, file.mime_type, file.id)

    if file.parsed_text:
        locations = extract_locations(file.parsed_text,
                                      pipeline=address_pipeline,
                                      fallback_city=fallback_city)
        file.locations.set(locations)
        persons = extract_persons(file.name + "\n" +
                                  (file.parsed_text or "") + "\n")
        file.mentioned_persons.set(persons)
        logger.debug("File {}: Found {} locations and {} persons".format(
            file.id, len(locations), len(persons)))
    else:
        logger.warning("File {}: Couldn't get any text".format(file.id))

    # Close stale db connections before saving; presumably this runs in a
    # long-lived worker process — verify against the caller.
    db.connections.close_all()
    file.save()
    return True
def test_person_extraction(self):
    """Mentions in free text are matched to Person fixtures.

    Covers informal order ("Frank Underwood"), comma-reversed formal order
    ("Underwood, Francis"), and a name fused into another token
    ("WilliamConway") that must not match.

    frank/doug/will refer to fixture persons pk 1/4/7.
    """
    frank = Person.objects.get(pk=1)
    doug = Person.objects.get(pk=4)
    will = Person.objects.get(pk=7)

    text = "A text \nabout Frank Underwood, Stamper, Doug, and a \nmisspelled WilliamConway."
    persons = extract_persons(text)
    # assertIn/assertNotIn give informative failure messages, unlike
    # assertTrue(x in y)/assertFalse(x in y).
    self.assertIn(doug, persons)
    self.assertIn(frank, persons)
    self.assertNotIn(will, persons)

    text = "Also the more formal name, \"Underwood, Francis\" should be found."
    persons = extract_persons(text)
    self.assertNotIn(doug, persons)
    self.assertIn(frank, persons)
    self.assertNotIn(will, persons)
def test_person_extraction(self):
    """Mentions in free text are matched to Person fixtures.

    Covers informal order ("Frank Underwood"), comma-reversed formal order
    ("Underwood, Francis"), a name fused into another token
    ("WilliamConway") that must not match, and word-boundary handling
    ("Doug Stampering" must not match Doug Stamper).

    frank/doug/will refer to fixture persons pk 1/4/7.
    """
    frank = Person.objects.get(pk=1)
    doug = Person.objects.get(pk=4)
    will = Person.objects.get(pk=7)

    text = "A text \nabout Frank Underwood, Stamper, Doug, and a \nmisspelled WilliamConway."
    persons = extract_persons(text)
    # assertIn/assertNotIn give informative failure messages, unlike
    # assertTrue(x in y)/assertTrue(x not in y).
    self.assertIn(doug, persons)
    self.assertIn(frank, persons)
    self.assertNotIn(will, persons)

    text = 'Also the more formal name, "Underwood, Francis" should be found.'
    persons = extract_persons(text)
    self.assertNotIn(doug, persons)
    self.assertIn(frank, persons)
    self.assertNotIn(will, persons)

    text = "We should check word boundaries like Doug Stampering something."
    persons = extract_persons(text)
    self.assertNotIn(doug, persons)
def parse_file(self, file: File, fallback_city: str):
    """OCR a single file from minio storage and persist text, persons and locations.

    Reads the raw file from the minio bucket keyed by ``file.id``, runs OCR,
    and — if anything was recognized — stores the cleaned text plus the
    extracted person and location links.
    """
    logging.info("- Parsing: " + str(file.id) + " (" + file.name + ")")
    with minio_client().get_object(minio_file_bucket,
                                   str(file.id)) as file_handle:
        recognized_text = get_ocr_text_from_pdf(file_handle.read())

    if len(recognized_text) > 0:
        file.parsed_text = cleanup_extracted_text(recognized_text)
        # Many-to-many relations must be updated via .set(); plain
        # assignment raises TypeError on Django >= 2.0 and is inconsistent
        # with the .set() call for locations right below.
        file.mentioned_persons.set(
            extract_persons(file.name + "\n" + (recognized_text or "") +
                            "\n"))
        file.locations.set(
            extract_locations(file.parsed_text, fallback_city))
        file.save()
    else:
        logging.warning("Nothing recognized")
def parse_file(self, file: File):
    """OCR a file stored on local disk under MEDIA_ROOT and persist the results.

    Builds the absolute path to ``file.storage_filename``, runs OCR on it,
    and — if anything was recognized — stores the cleaned text plus the
    extracted person and location links.
    """
    logging.info("- Parsing: " + str(file.id) + " (" + file.name + ")")
    # NOTE(review): os.path.dirname(__name__) is almost certainly meant to
    # be os.path.dirname(__file__); for a top-level module __name__ has no
    # directory part, so this resolves relative to the current working
    # directory — confirm before changing.
    file_path = os.path.abspath(os.path.dirname(__name__))
    file_path = os.path.join(file_path, settings.MEDIA_ROOT,
                             file.storage_filename)
    recognized_text = get_ocr_text_from_pdf(file_path)

    if len(recognized_text) > 0:
        file.parsed_text = cleanup_extracted_text(recognized_text)
        # Many-to-many relations must be updated via .set(); plain
        # assignment raises TypeError on Django >= 2.0. The other import
        # paths in this file already use .set() for these fields.
        file.mentioned_persons.set(
            extract_persons(file.name + "\n" + (recognized_text or "") +
                            "\n"))
        file.locations.set(extract_locations(recognized_text))
        file.save()
    else:
        logging.warning("Nothing recognized")
def file(self, libobject: OParl.File):
    """Import or update a single OParl file object.

    Returns the (possibly updated) File instance; returns early with the
    unchanged instance when no modification was detected.
    """
    file, do_update = self.check_for_modification(libobject, File)
    if not file or not do_update:
        return file
    self.logger.info("Processing File {}".format(libobject.get_id()))

    # Derive a display filename: prefer the real filename, then a slugified
    # name with a guessed extension, then the tail of the access url.
    if libobject.get_file_name():
        displayed_filename = libobject.get_file_name()
    elif libobject.get_name():
        extension = mimetypes.guess_extension("application/pdf") or ""
        length = self.filename_length_cutoff - len(extension)
        displayed_filename = slugify(
            libobject.get_name())[:length] + extension
    else:
        displayed_filename = slugify(
            libobject.get_access_url())[-self.filename_length_cutoff:]

    # Remember the pre-update state so the expensive extraction below only
    # runs when something relevant changed.
    parsed_text_before = file.parsed_text
    file_name_before = file.name

    file.oparl_id = libobject.get_id()
    file.name = libobject.get_name()
    file.displayed_filename = displayed_filename
    file.mime_type = libobject.get_mime_type(
    ) or "application/octet-stream"
    file.legal_date = self.glib_datetime_to_python_date(
        libobject.get_date())
    file.sort_date = file.created
    file.oparl_access_url = libobject.get_access_url()
    file.oparl_download_url = libobject.get_download_url()

    # If no text comes from the API, don't overwrite previously extracted PDF-content with an empty string
    if libobject.get_text():
        file.parsed_text = libobject.get_text()

    if self.download_files:
        self.download_file(file, libobject)
    else:
        file.storage_filename = ""
        file.filesize = -1

    parsed_text = file.parsed_text
    if file.storage_filename and not file.parsed_text:
        parsed_text = self.extract_text_from_file(file)

    file = self.call_custom_hook("sanitize_file", file)

    # Truncate over-long names with an ellipsis so they fit the column.
    if len(file.name) > 200:
        file.name = textwrap.wrap(file.name, 199)[0] + "\u2026"

    file.save()

    if file_name_before != file.name or parsed_text_before != file.parsed_text:
        # These two operations are rather CPU-intensive, so we only perform them if something relevant has changed
        # Many-to-many relations must be updated via .set(); plain
        # assignment raises TypeError on Django >= 2.0, and .set() is what
        # the other import paths in this file use for these fields.
        file.locations.set(extract_locations(parsed_text))
        file.mentioned_persons.set(
            extract_persons(file.name + "\n" + (parsed_text or "") + "\n"))
        file.save()

    return file
def download_and_analyze_file(self, file_id: int,
                              address_pipeline: AddressPipeline,
                              fallback_city: str) -> bool:
    """Download and analyse a single file: extract text, locations and persons.

    Downloads the file behind ``file.get_oparl_url()`` into a temporary
    file, optionally mirrors it to minio, extracts text (unless the API
    already provided ``parsed_text``), and links extracted locations and
    mentioned persons.

    Returns False for http errors on downloading and True otherwise.
    """
    file = File.objects.get(id=file_id)
    url = file.get_oparl_url()

    with NamedTemporaryFile() as tmp_file:
        try:
            content, content_type = self.loader.load_file(url)
            # The server-reported content type wins over the stored mime
            # type, but a divergence is worth a warning.
            if content_type and file.mime_type and content_type != file.mime_type:
                logger.warning(
                    "Diverging mime types: Expected {}, got {}".format(
                        file.mime_type, content_type))
            # An html response for a document download means the server
            # answered with an error page but a 2xx status.
            if content_type and content_type.split(";")[0] == "text/html":
                logger.error(
                    f"File {file.id}: Content type was {content_type}, this seems to be a silent error"
                )
                return False
            file.mime_type = content_type or file.mime_type
            tmp_file.write(content)
            # Rewind so the minio upload and the text extraction below
            # read from the start of the temporary file.
            tmp_file.file.seek(0)
            file.filesize = len(content)
        except RequestException as e:
            # Normal server error
            if e.response and 400 <= e.response.status_code < 600:
                logger.error(
                    f"File {file.id}: Failed to download {url} with error {e.response.status_code}"
                )
            else:
                # No response attached (connection error, timeout, ...):
                # log with the full traceback.
                logger.exception(
                    f"File {file.id}: Failed to download {url}")
            return False

        logger.debug("File {}: Downloaded {} ({}, {})".format(
            file.id, url, file.mime_type, filesizeformat(file.filesize)))

        # Mirror the raw file into minio under the database id as object
        # key, unless files are served through a proxy template.
        if not settings.PROXY_ONLY_TEMPLATE:
            minio_client().put_object(
                minio_file_bucket,
                str(file.id),
                tmp_file.file,
                file.filesize,
                content_type=file.mime_type,
            )

        # If the api has text, keep that
        if self.download_files and not file.parsed_text:
            file.parsed_text, file.page_count = extract_from_file(
                tmp_file.file, tmp_file.name, file.mime_type, file.id)

    if file.parsed_text:
        locations = extract_locations(file.parsed_text,
                                      pipeline=address_pipeline,
                                      fallback_city=fallback_city)
        file.locations.set(locations)
        persons = extract_persons(file.name + "\n" +
                                  (file.parsed_text or "") + "\n")
        file.mentioned_persons.set(persons)
        logger.debug("File {}: Found {} locations and {} persons".format(
            file.id, len(locations), len(persons)))
    else:
        logger.warning(f"File {file.id}: Couldn't get any text")

    try:
        # Close stale db connections before saving; presumably this runs
        # in a long-lived worker process — verify against the caller.
        db.connections.close_all()
        file.save()
    except (ElasticsearchException, DatabaseError) as e:
        # file.save() also updates the search index, so both database and
        # Elasticsearch failures can surface here.
        logger.exception(f"File {file.id}: Failed to save: {e}")
        return False
    return True
def parse_file(self, file: File):
    """Re-extract the mentioned persons from a file's already-parsed text."""
    logging.info("- Parsing: " + str(file.id) + " (" + file.name + ")")
    # Many-to-many relations must be updated via .set(); plain assignment
    # raises TypeError on Django >= 2.0, and the other import paths in this
    # file already use .set() for mentioned_persons.
    file.mentioned_persons.set(
        extract_persons(file.name + "\n" + (file.parsed_text or "") + "\n"))
    file.save()