def download_and_analyze_file(self, file_id: int, address_pipeline: AddressPipeline, fallback_city: str) -> bool:
    """Downloads and analyses a single file, i.e. extracting text, locations and persons.

    Returns False for http errors on downloading and True otherwise.
    """
    file = File.objects.get(id=file_id)
    url = file.get_oparl_url()
    with NamedTemporaryFile() as tmpfile:
        try:
            # content_type is whatever the server reported; it may be empty
            content, content_type = self.loader.load_file(url)
            if content_type and file.mime_type and content_type != file.mime_type:
                logger.warning(
                    "Diverging mime types: Expected {}, got {}".format(
                        file.mime_type, content_type))
            # Prefer the server-reported type, fall back to the stored one
            file.mime_type = content_type or file.mime_type
            tmpfile.write(content)
            tmpfile.file.seek(0)
            file.filesize = len(content)
        except HTTPError:
            logger.exception("File {}: Failed to download {}".format(
                file.id, url))
            return False
        logger.debug("File {}: Downloaded {} ({}, {})".format(
            file.id, url, file.mime_type, filesizeformat(file.filesize)))
        # Store the raw download in minio keyed by the file's database id
        minio_client().put_object(
            minio_file_bucket,
            str(file.id),
            tmpfile.file,
            file.filesize,
            content_type=file.mime_type,
        )

        # If the api has text, keep that
        if self.download_files and not file.parsed_text:
            file.parsed_text, file.page_count = extract_from_file(
                tmpfile.file, tmpfile.name, file.mime_type, file.id)

        if file.parsed_text:
            locations = extract_locations(file.parsed_text,
                                          pipeline=address_pipeline,
                                          fallback_city=fallback_city)
            file.locations.set(locations)
            persons = extract_persons(file.name + "\n" +
                                      (file.parsed_text or "") + "\n")
            file.mentioned_persons.set(persons)
            logger.debug("File {}: Found {} locations and {} persons".format(
                file.id, len(locations), len(persons)))
        else:
            logger.warning("File {}: Couldn't get any text".format(file.id))

    # NOTE(review): closing all connections before save looks intended for
    # worker processes holding stale db connections -- confirm against caller.
    db.connections.close_all()
    file.save()
    return True
def remove_pgp_key(self):
    """Delete the stored PGP key from minio and clear the fingerprint.

    A no-op when no key is stored, so clicking "remove" twice is harmless.
    """
    # If the user clicks "remove" when the key is already removed, we can ignore that
    if self.pgp_key_fingerprint:
        minio_client().remove_object(minio_pgp_keys_bucket,
                                     self.pgp_key_fingerprint)
        self.pgp_key_fingerprint = None
        self.save()
def add_pgp_key(self, pgp_key_fingerprint: str, pgp_key: str):
    """Store a PGP key in minio under its fingerprint and remember that fingerprint.

    This should eventually be abstracted away into a file manager class
    """
    encoded = pgp_key.encode()
    stream = BytesIO(encoded)
    minio_client().put_object(
        minio_pgp_keys_bucket,
        pgp_key_fingerprint,
        stream,
        len(encoded),
    )
    self.pgp_key_fingerprint = pgp_key_fingerprint
    self.save()
def test_manual_deletion(pytestconfig):
    """Check that after a file has been manually deleted, it can't get re-imported
    and it's gone from minio"""
    url = "https://example.org/file/1"
    file_id = 1
    sample_file = File(
        name="Bad File",
        original_id=file_id,
        url=url,
        claimed_size=None,
        paper_original_id=sample_paper.original_id,
    )
    data = RisData(sample_city, None, [], [], [sample_paper], [sample_file],
                   [], [], [], 2)
    body = Body(name=data.meta.name,
                short_name=data.meta.name,
                ags=data.meta.ags)
    body.save()
    import_data(body, data)

    # Serve the file content from a mocked http endpoint and import it
    with responses.RequestsMock() as requests_mock:
        requests_mock.add(
            responses.GET,
            url,
            body=Path(pytestconfig.rootdir).joinpath(
                "testdata/media/file.txt").read_bytes(),
            status=200,
            content_type="text/plain",
        )
        importer = Importer(BaseLoader({}), force_singlethread=True)
        [successful, failed] = importer.load_files(sample_city.name)
        assert successful == 1 and failed == 0

    # Ensure that the file is there
    assert minio_client().get_object(minio_file_bucket, str(file_id))
    assert models.File.objects.filter(pk=file_id).first()

    # This is what we test
    models.File.objects.get(pk=file_id).manually_delete()
    with pytest.raises(MinioException):
        minio_client().get_object(minio_file_bucket, str(file_id))

    # Another import, to ensure that manually delete is respected
    import_data(body, data)
    assert not models.File.objects.filter(pk=file_id).first()
    # No requests registered: the importer must not try to download anything
    with responses.RequestsMock():
        importer = Importer(BaseLoader({}), force_singlethread=True)
        [successful, failed] = importer.load_files(sample_city.name)
        assert successful == 0 and failed == 0
    with pytest.raises(MinioException):
        minio_client().get_object(minio_file_bucket, str(file_id))
def test_manual_deletion(pytestconfig, caplog):
    """Check that after a file has been manually deleted, it can't get re-imported
    and it's gone from minio"""
    url = "https://example.org/file/1"
    file_id = 1
    body, data = make_sample_file(file_id, url)

    # Serve the file content from a mocked http endpoint and import it
    with responses.RequestsMock() as requests_mock:
        requests_mock.add(
            responses.GET,
            url,
            body=pytestconfig.rootpath.joinpath(
                "testdata/media/file.txt").read_bytes(),
            status=200,
            content_type="text/plain",
        )
        importer = Importer(BaseLoader({}), force_singlethread=True)
        [successful, failed] = importer.load_files(sample_city.name)
        assert successful == 1 and failed == 0

    # Ensure that the file is there
    assert minio_client().get_object(minio_file_bucket, str(file_id))
    assert models.File.objects.filter(pk=file_id).first()

    # This is what we test
    models.File.objects.get(pk=file_id).manually_delete()
    with pytest.raises(MinioException):
        minio_client().get_object(minio_file_bucket, str(file_id))

    # Another import, to ensure that manually delete is respected
    import_data(body, data)
    assert not models.File.objects.filter(pk=file_id).first()
    # No requests registered: the importer must not try to download anything
    with responses.RequestsMock():
        importer = Importer(BaseLoader({}), force_singlethread=True)
        [successful, failed] = importer.load_files(sample_city.name)
        assert successful == 0 and failed == 0
    with pytest.raises(MinioException):
        minio_client().get_object(minio_file_bucket, str(file_id))

    # The import is expected to log exactly these two warnings/errors
    assert caplog.messages == [
        "File 1 has an unknown mime type: 'text/plain'",
        "File 1: Couldn't get any text",
    ]
def get_pgp_key(self) -> Optional[bytes]:
    """Fetch the stored PGP key bytes from minio, or None when no key is stored."""
    fingerprint = self.pgp_key_fingerprint
    if not fingerprint:
        return None
    key_object = minio_client().get_object(minio_pgp_keys_bucket, fingerprint)
    return key_object.read()
def file_serve(request, id):
    """Proxy a stored file from minio through django, preserving its content type."""
    logger.warning("Serving media files through django is slow")
    stored = minio_client().get_object(minio_file_bucket, id)
    body = stored.read()
    response = HttpResponse(body)
    # Forward the content type minio recorded at upload time
    response["Content-Type"] = stored.headers["Content-Type"]
    if settings.SITE_SEO_NOINDEX:
        # Keep search engines away from the media files
        response["X-Robots-Tag"] = "noindex"
    return response
def handle(self, *args, **options):
    """Print statistics about the imported data and sanity-check minio.

    Writes row counts for every model, aggregate file/body/alert statistics,
    and a warning when files are marked imported but missing from minio.
    """
    # Row counts for every concrete model in the models module
    for name, obj in inspect.getmembers(models):
        if (not inspect.isclass(obj) or not issubclass(obj, Model)
                or name in ["DefaultFields"]):
            continue
        self.stdout.write(f"{name}: {obj.objects.count()}")

    files_total = File.objects.count()
    files_with_text = File.objects.filter(
        parsed_text__isnull=False).count()
    files_with_location = (File.objects.annotate(
        location_count=Count("locations")).filter(
            location_count__gte=1).count())
    files_with_persons = (File.objects.annotate(
        persons_count=Count("mentioned_persons")).filter(
            persons_count__gte=1).count())
    files_not_downloaded = File.objects.filter(
        filesize__isnull=True, oparl_access_url__isnull=False).count()
    files_without_url = File.objects.filter(
        oparl_access_url__isnull=True).count()
    self.stdout.write(
        f"Files total: {files_total}; with text: {files_with_text}; "
        f"with locations: {files_with_location}; with persons: {files_with_persons}; "
        f"not downloaded: {files_not_downloaded}; without url: {files_without_url}"
    )

    bodies_with_outline = Body.objects.filter(
        outline__isnull=False).count()
    bodies_with_ags = Body.objects.filter(ags__isnull=False).count()
    self.stdout.write(
        f"Bodies with an outline: {bodies_with_outline}; with an ags: {bodies_with_ags}"
    )

    users_with_alerts = UserAlert.objects.values("user").distinct().count()
    users = User.objects.count()
    alerts = UserAlert.objects.count()
    self.stdout.write(
        f"There are {alerts} alerts by {users_with_alerts} of {users} users"
    )

    # Check if there are files which are listed as imported but aren't in minio
    # We convert everything to strings because there might be non-numeric files in minio
    existing_files = set(
        file.object_name
        for file in minio_client().list_objects(minio_file_bucket))
    expected_files = set(
        str(i) for i in File.objects.filter(
            filesize__gt=0).values_list("id", flat=True))
    missing_files = len(expected_files - existing_files)
    if missing_files > 0:
        # Fix: this message was a string literal broken across a raw newline
        # in the source, which is a syntax error; rejoined into one literal.
        self.stdout.write(
            f"{missing_files} files are marked as imported but aren't available in minio"
        )
def file_serve(request, id):
    """Proxy a stored file from minio through django.

    Raises Http404 (via get_object_or_404) when the file row no longer exists
    in the database, so manually deleted files are not served.
    """
    logger.warning("Serving media files through django is slow")
    # Fix: this explanation was a bare triple-quoted string *after* the first
    # statement -- a no-op, not a docstring. Converted to a comment:
    # Ensure that the file is not deleted in the database
    get_object_or_404(File, id=id)
    minio_file = minio_client().get_object(minio_file_bucket, id)
    response = HttpResponse(minio_file.read())
    # Forward the content type minio recorded at upload time
    response["Content-Type"] = minio_file.headers["Content-Type"]
    if settings.SITE_SEO_NOINDEX:
        # Keep search engines away from the media files
        response["X-Robots-Tag"] = "noindex"
    return response
def handle(self, *args, **options):
    """Reset the filesize of files marked as imported but missing from minio,
    so the importer treats them as not downloaded and fetches them again."""
    # Fix: minio may contain objects whose names are not numeric database
    # ids; int() would raise ValueError on those, so skip them explicitly.
    existing_files = {
        int(file.object_name)
        for file in minio_client().list_objects(minio_file_bucket)
        if file.object_name.isdigit()
    }
    expected_files: Set[int] = set(
        File.objects.filter(filesize__gt=0).values_list("id", flat=True)
    )
    missing_files = expected_files - existing_files
    if len(missing_files) > 0:
        self.stdout.write(
            f"{missing_files} files are marked as imported but aren't available in minio"
        )
        # filesize=None marks the files as not downloaded (see the importer)
        File.objects.filter(id__in=missing_files).update(filesize=None)
def parse_file(self, file: File, fallback_city: str):
    """Run OCR on a file stored in minio and extract persons and locations.

    Saves the file only when OCR produced any text; otherwise just logs.
    """
    logging.info("- Parsing: " + str(file.id) + " (" + file.name + ")")
    with minio_client().get_object(minio_file_bucket,
                                   str(file.id)) as file_handle:
        recognized_text = get_ocr_text_from_pdf(file_handle.read())
    if len(recognized_text) > 0:
        file.parsed_text = cleanup_extracted_text(recognized_text)
        # NOTE(review): direct assignment to a many-to-many field was removed
        # in Django 2.0 in favour of `.set()` (as used for locations below)
        # -- confirm this runs on a Django version where assignment works.
        file.mentioned_persons = extract_persons(file.name + "\n" +
                                                 (recognized_text or "") +
                                                 "\n")
        file.locations.set(
            extract_locations(file.parsed_text, fallback_city))
        file.save()
    else:
        logging.warning("Nothing recognized")
def manually_delete(self):
    """Sometimes we need to delete files even if they were not deleted at the source"""
    # Record both flags before touching storage, then drop the object itself.
    self.deleted = True
    self.manually_deleted = True
    self.save()
    minio_client().remove_object(minio_file_bucket, str(self.id))
def download_and_analyze_file(self, file_id: int, address_pipeline: AddressPipeline,
                              fallback_city: str) -> bool:
    """Downloads and analyses a single file, i.e. extracting text, locations and persons.

    Returns False for http errors on downloading and True otherwise.
    """
    file = File.objects.get(id=file_id)
    url = file.get_oparl_url()
    with NamedTemporaryFile() as tmp_file:
        try:
            # content_type is whatever the server reported; it may be empty
            content, content_type = self.loader.load_file(url)
            if content_type and file.mime_type and content_type != file.mime_type:
                logger.warning(
                    "Diverging mime types: Expected {}, got {}".format(
                        file.mime_type, content_type))
            # An html response for a document url is taken as an error page
            # served with status 200, so treat it as a failed download
            if content_type and content_type.split(";")[0] == "text/html":
                logger.error(
                    f"File {file.id}: Content type was {content_type}, this seems to be a silent error"
                )
                return False
            # Prefer the server-reported type, fall back to the stored one
            file.mime_type = content_type or file.mime_type
            tmp_file.write(content)
            tmp_file.file.seek(0)
            file.filesize = len(content)
        except RequestException as e:
            # Normal server error
            if e.response and 400 <= e.response.status_code < 600:
                logger.error(
                    f"File {file.id}: Failed to download {url} with error {e.response.status_code}"
                )
            else:
                # Connection-level failure: no response object to report
                logger.exception(
                    f"File {file.id}: Failed to download {url}")
            return False
        logger.debug("File {}: Downloaded {} ({}, {})".format(
            file.id, url, file.mime_type, filesizeformat(file.filesize)))
        # With a download proxy configured, minio storage is skipped entirely
        if not settings.PROXY_ONLY_TEMPLATE:
            minio_client().put_object(
                minio_file_bucket,
                str(file.id),
                tmp_file.file,
                file.filesize,
                content_type=file.mime_type,
            )

        # If the api has text, keep that
        if self.download_files and not file.parsed_text:
            file.parsed_text, file.page_count = extract_from_file(
                tmp_file.file, tmp_file.name, file.mime_type, file.id)

        if file.parsed_text:
            locations = extract_locations(file.parsed_text,
                                          pipeline=address_pipeline,
                                          fallback_city=fallback_city)
            file.locations.set(locations)
            persons = extract_persons(file.name + "\n" +
                                      (file.parsed_text or "") + "\n")
            file.mentioned_persons.set(persons)
            logger.debug("File {}: Found {} locations and {} persons".format(
                file.id, len(locations), len(persons)))
        else:
            logger.warning(f"File {file.id}: Couldn't get any text")

    try:
        # NOTE(review): closing all connections before save looks intended for
        # worker processes holding stale db connections -- confirm.
        db.connections.close_all()
        file.save()
    except (ElasticsearchException, DatabaseError) as e:
        logger.exception(f"File {file.id}: Failed to save: {e}")
        return False
    return True