def test_manual_deletion(pytestconfig):
    """Check that after a file has been manually deleted,
    it can't get re-imported and it's gone from minio"""
    url = "https://example.org/file/1"
    file_id = 1
    sample_file = File(
        name="Bad File",
        original_id=file_id,
        url=url,
        claimed_size=None,
        paper_original_id=sample_paper.original_id,
    )
    data = RisData(sample_city, None, [], [], [sample_paper], [sample_file], [], [], [], 2)
    body = Body(name=data.meta.name, short_name=data.meta.name, ags=data.meta.ags)
    body.save()
    import_data(body, data)

    with responses.RequestsMock() as requests_mock:
        requests_mock.add(
            responses.GET,
            url,
            body=Path(pytestconfig.rootdir)
            .joinpath("testdata/media/file.txt")
            .read_bytes(),
            status=200,
            content_type="text/plain",
        )
        importer = Importer(BaseLoader({}), force_singlethread=True)
        [successful, failed] = importer.load_files(sample_city.name)
        assert successful == 1 and failed == 0

    # Ensure that the file is there
    assert minio_client().get_object(minio_file_bucket, str(file_id))
    assert models.File.objects.filter(pk=file_id).first()

    # This is what we test
    models.File.objects.get(pk=file_id).manually_delete()

    with pytest.raises(MinioException):
        minio_client().get_object(minio_file_bucket, str(file_id))

    # Another import, to ensure that manual deletion is respected
    import_data(body, data)
    assert not models.File.objects.filter(pk=file_id).first()

    with responses.RequestsMock():
        importer = Importer(BaseLoader({}), force_singlethread=True)
        [successful, failed] = importer.load_files(sample_city.name)
        assert successful == 0 and failed == 0

    with pytest.raises(MinioException):
        minio_client().get_object(minio_file_bucket, str(file_id))

def test_somacos_encoded_urls():
    with responses.RequestsMock() as requests_mock:
        requests_mock.add(
            requests_mock.GET,
            "https://oparl.wuppertal.de/oparl/bodies/0001/papers?page=2&modified_since=2020-11-26T19:26:34+00:00",
            content_type="text/json; charset=utf-8",
            json={},
        )
        requests_mock.add(
            requests_mock.GET,
            "https://oparl.wuppertal.de/oparl/bodies/0001/papers?page=2&modified_since=2020-11-26T19%3A26%3A34%2B00%3A00",
            content_type="text/html",
            status=404,
        )

        with pytest.raises(HTTPError):
            BaseLoader({}).load(
                "https://oparl.wuppertal.de/oparl/bodies/0001/papers",
                query={"page": "2", "modified_since": "2020-11-26T19:26:34+00:00"},
            )

        loader = SomacosLoader({})
        loader.load(
            "https://oparl.wuppertal.de/oparl/bodies/0001/papers",
            query={"page": "2", "modified_since": "2020-11-26T19:26:34+00:00"},
        )

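# The test above implies that SomacosLoader must send the modified_since
# timestamp unencoded: the mock answering the percent-encoded URL returns 404,
# while the unencoded URL succeeds. A minimal sketch of that behaviour,
# assuming BaseLoader.load(url, query=None) as used in the test; the real
# SomacosLoader in the repository may be implemented differently.
class SketchSomacosLoader(BaseLoader):
    def load(self, url, query=None):
        if query:
            # Build the query string by hand so ":" and "+" are not escaped.
            url = url + "?" + "&".join(f"{key}={value}" for key, value in query.items())
        return super().load(url)
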
def test_manual_deletion(pytestconfig, caplog):
    """Check that after a file has been manually deleted,
    it can't get re-imported and it's gone from minio"""
    url = "https://example.org/file/1"
    file_id = 1
    body, data = make_sample_file(file_id, url)

    with responses.RequestsMock() as requests_mock:
        requests_mock.add(
            responses.GET,
            url,
            body=pytestconfig.rootpath.joinpath("testdata/media/file.txt").read_bytes(),
            status=200,
            content_type="text/plain",
        )
        importer = Importer(BaseLoader({}), force_singlethread=True)
        [successful, failed] = importer.load_files(sample_city.name)
        assert successful == 1 and failed == 0

    # Ensure that the file is there
    assert minio_client().get_object(minio_file_bucket, str(file_id))
    assert models.File.objects.filter(pk=file_id).first()

    # This is what we test
    models.File.objects.get(pk=file_id).manually_delete()

    with pytest.raises(MinioException):
        minio_client().get_object(minio_file_bucket, str(file_id))

    # Another import, to ensure that manual deletion is respected
    import_data(body, data)
    assert not models.File.objects.filter(pk=file_id).first()

    with responses.RequestsMock():
        importer = Importer(BaseLoader({}), force_singlethread=True)
        [successful, failed] = importer.load_files(sample_city.name)
        assert successful == 0 and failed == 0

    with pytest.raises(MinioException):
        minio_client().get_object(minio_file_bucket, str(file_id))

    assert caplog.messages == [
        "File 1 has an unknown mime type: 'text/plain'",
        "File 1: Couldn't get any text",
    ]

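# The tests that call make_sample_file do not show the helper itself. A
# plausible reconstruction, derived from the inline setup of the first
# test_manual_deletion variant above; the actual helper in the repository
# may differ in details.
def make_sample_file(file_id, url):
    sample_file = File(
        name="Bad File",
        original_id=file_id,
        url=url,
        claimed_size=None,
        paper_original_id=sample_paper.original_id,
    )
    data = RisData(sample_city, None, [], [], [sample_paper], [sample_file], [], [], [], 2)
    body = Body(name=data.meta.name, short_name=data.meta.name, ags=data.meta.ags)
    body.save()
    import_data(body, data)
    return body, data
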
def test_load_file_oom(caplog):
    importer = MockImporter(BaseLoader({}), force_singlethread=True)
    with override_settings(SUBPROCESS_MAX_RAM=1 * 1024 * 1024):
        failed = importer.load_files_multiprocessing(
            AddressPipeline([]), "München", list(range(64))
        )
    assert failed == 1
    assert caplog.messages == [
        "File 1: Import failed du to excessive memory usage (Limit: 1048576)"
    ]

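# test_load_file_oom relies on a SUBPROCESS_MAX_RAM setting. One common way to
# enforce such a cap in worker processes on Unix is resource.setrlimit; this is
# only an illustration of the mechanism, not necessarily how the importer
# implements it.
import resource


def cap_worker_memory(max_bytes: int) -> None:
    # Limit the address space of the current process; an allocation beyond the
    # cap then raises MemoryError instead of exhausting the host's memory.
    resource.setrlimit(resource.RLIMIT_AS, (max_bytes, max_bytes))
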
def handle(self, *args, **options):
    input_file: Path = options["input"]

    logger.info("Loading the data")
    with input_file.open() as fp:
        json_data = json.load(fp)

    if json_data["format_version"] != format_version:
        raise CommandError(
            f"This version of {settings.PRODUCT_NAME} can only import json format version {format_version}, "
            f"but the json file you provided is version {json_data['format_version']}"
        )

    ris_data: RisData = converter.structure(json_data, RisData)

    body = models.Body.objects.filter(name=ris_data.meta.name).first()
    if not body:
        logger.info("Building the body")

        if options["ags"] or ris_data.meta.ags:
            ags = options["ags"] or ris_data.meta.ags
        else:
            ags = city_to_ags(ris_data.meta.name, False)
        if not ags:
            raise RuntimeError(
                f"Failed to determine the Amtliche Gemeindeschlüssel for '{ris_data.meta.name}'. "
                f"Please look it up yourself and specify it with `--ags`"
            )
        logger.info(f"The Amtliche Gemeindeschlüssel is {ags}")
        body = models.Body(
            name=ris_data.meta.name, short_name=ris_data.meta.name, ags=ags
        )
        body.save()
        if not options["skip_body_extra"]:
            import_outline(body)
            import_streets(body)
    else:
        logging.info("Using existing body")

    # TODO: Re-enable this after some more thorough testing
    # handle_counts(ris_data, options["allow_shrinkage"])

    import_data(body, ris_data)
    fix_sort_date(datetime.datetime.now(tz=tz.tzlocal()))

    if not options["skip_download"]:
        Importer(BaseLoader(dict()), force_singlethread=True).load_files(
            fallback_city=body.short_name
        )

    if not options["no_notify_users"]:
        logger.info("Sending notifications")
        NotifyUsers().notify_all()

def get_importer(self, options: Dict[str, Any]) -> Tuple[Importer, Body]:
    if options.get("body"):
        body = Body.objects.get(oparl_id=options["body"])
    else:
        body = Body.objects.get(id=settings.SITE_DEFAULT_BODY)

    if body.oparl_id is not None:
        loader = get_loader_from_body(body.oparl_id)
        importer = Importer(loader, body, ignore_modified=options["ignore_modified"])
    else:
        importer = Importer(
            BaseLoader(dict()), ignore_modified=options["ignore_modified"]
        )
    importer.force_singlethread = options["force_singlethread"]

    return importer, body

def test_file_404(pytestconfig, caplog):
    """Check that a file whose download fails with a 404 is counted and logged as failed"""
    url = "https://example.org/file/1"
    file_id = 1
    make_sample_file(file_id, url)

    with responses.RequestsMock() as requests_mock:
        requests_mock.add(responses.GET, url, status=404, content_type="text/plain")
        importer = Importer(BaseLoader({}), force_singlethread=True)
        [successful, failed] = importer.load_files(sample_city.name, update=True)
        assert successful == 0 and failed == 1

    assert caplog.messages == [
        f"File 1: Failed to download {url}",
        "1 files failed to download",
    ]

def test_missing_organization(caplog):
    with RequestsMock() as requests_mock:
        requests_mock.add(
            requests_mock.GET,
            "http://oparl.wuppertal.de/oparl/bodies/0001/organizations/gr/230",
            json={"error": "not found"},
            status=404,
        )
        # Add another one to test for uniqueness constraints
        requests_mock.add(
            requests_mock.GET,
            "http://oparl.wuppertal.de/oparl/bodies/0001/organizations/gr/231",
            json={"error": "not found"},
            status=404,
        )
        requests_mock.add(
            requests_mock.GET,
            "http://oparl.wuppertal.de/oparl/bodies",
            json={
                "data": [
                    json.loads(Path("testdata/oparl-missing/body.json").read_text())
                ],
                "links": {},
                "pagination": {},
            },
        )
        requests_mock.add(
            requests_mock.GET,
            "http://oparl.wuppertal.de/oparl/bodies/0001/organizations",
            json=empty_page,
        )
        requests_mock.add(
            requests_mock.GET,
            "http://oparl.wuppertal.de/oparl/bodies/0001/people",
            json=empty_page,
        )
        requests_mock.add(
            requests_mock.GET,
            "http://oparl.wuppertal.de/oparl/bodies/0001/papers",
            json=empty_page,
        )
        requests_mock.add(
            requests_mock.GET,
            "http://oparl.wuppertal.de/oparl/bodies/0001/meetings",
            json={
                "data": [
                    json.loads(Path("testdata/oparl-missing/meeting.json").read_text())
                ],
                "links": {},
                "pagination": {},
            },
        )
        requests_mock.add(
            requests_mock.GET,
            "http://oparl.wuppertal.de/oparl/bodies/0001/people/292",
            status=404,
        )

        body_id = "http://oparl.wuppertal.de/oparl/bodies/0001"
        importer = Importer(
            BaseLoader(
                json.loads(Path("testdata/oparl-missing/system.json").read_text())
            ),
            force_singlethread=True,
        )
        [body_data] = importer.load_bodies(body_id)
        [body] = importer.import_bodies()
        importer.converter.default_body = body
        body.ags = "05124000"
        importer.fetch_lists_initial([body_data.data])
        importer.import_objects()

    assert set(i.oparl_id for i in Organization.objects.all()) == {
        "http://oparl.wuppertal.de/oparl/bodies/0001/organizations/gr/230",
        "http://oparl.wuppertal.de/oparl/bodies/0001/organizations/gr/231",
    }
    assert list(i.short_name for i in Organization.objects.all()) == [
        "Missing",
        "Missing",
    ]
    assert Person.objects.first().name == "Missing Person"
    assert caplog.messages == [
        "The Person http://oparl.wuppertal.de/oparl/bodies/0001/people/292 linked "
        "from http://oparl.wuppertal.de/oparl/bodies/0001/meetings/19160 was supposed "
        "to be a part of the external lists, but was not. This is a bug in the OParl "
        "implementation.",
        "Failed to load http://oparl.wuppertal.de/oparl/bodies/0001/people/292: 404 "
        "Client Error: Not Found for url: "
        "http://oparl.wuppertal.de/oparl/bodies/0001/people/292",
        "Using a dummy for http://oparl.wuppertal.de/oparl/bodies/0001/people/292. "
        "THIS IS BAD.",
        "The Organization "
        "http://oparl.wuppertal.de/oparl/bodies/0001/organizations/gr/230 linked from "
        "http://oparl.wuppertal.de/oparl/bodies/0001/meetings/19160 was supposed to "
        "be a part of the external lists, but was not. This is a bug in the OParl "
        "implementation.",
        "Failed to load "
        "http://oparl.wuppertal.de/oparl/bodies/0001/organizations/gr/230: 404 Client "
        "Error: Not Found for url: "
        "http://oparl.wuppertal.de/oparl/bodies/0001/organizations/gr/230",
        "Using a dummy for "
        "http://oparl.wuppertal.de/oparl/bodies/0001/organizations/gr/230. THIS IS "
        "BAD.",
        "The Organization "
        "http://oparl.wuppertal.de/oparl/bodies/0001/organizations/gr/231 linked from "
        "http://oparl.wuppertal.de/oparl/bodies/0001/meetings/19160 was supposed to "
        "be a part of the external lists, but was not. This is a bug in the OParl "
        "implementation.",
        "Failed to load "
        "http://oparl.wuppertal.de/oparl/bodies/0001/organizations/gr/231: 404 Client "
        "Error: Not Found for url: "
        "http://oparl.wuppertal.de/oparl/bodies/0001/organizations/gr/231",
        "Using a dummy for "
        "http://oparl.wuppertal.de/oparl/bodies/0001/organizations/gr/231. THIS IS "
        "BAD.",
    ]

def spurious_500(loader: BaseLoader):
    with responses.RequestsMock() as requests_mock:
        requests_mock.add(
            responses.GET,
            "https://ratsinfo.leipzig.de/bi/oparl/1.0/papers.asp?body=2387&p=2",
            json={"error": "spurious error"},
            status=500,
        )
        requests_mock.add(
            requests_mock.GET,
            "https://ratsinfo.leipzig.de/bi/oparl/1.0/papers.asp?body=2387&p=2",
            json={
                "data": [
                    {
                        "id": "https://ratsinfo.leipzig.de/bi/oparl/1.0/papers.asp?id=1000030",
                        "type": "https://schema.oparl.org/1.0/Paper",
                        "body": "https://ratsinfo.leipzig.de/bi/oparl/1.0/bodies.asp?id=2387",
                        "name": "Konzept der Stadt Leipzig zur fairen und nachhaltigen Beschaffung\r\n(eRis: DS V/3966)",
                        "reference": "DS-00029/14",
                        "paperType": "Informationsvorlage",
                        "date": "2014-09-09",
                        "mainFile": {
                            "id": "https://ratsinfo.leipzig.de/bi/oparl/1.0/files.asp?dtyp=130&id=1000487",
                            "type": "https://schema.oparl.org/1.0/File",
                            "name": "Vorlage-Sammeldokument",
                            "fileName": "1000487.pdf",
                            "mimeType": "application/pdf",
                            "modified": "2018-12-05T19:23:53+01:00",
                            "size": 211644,
                            "accessUrl": "https://ratsinfo.leipzig.de/bi/oparl/1.0/download.asp?dtyp=130&id=1000487",
                        },
                        "web": "N/Avo020.asp?VOLFDNR=1000030",
                        "created": "2014-07-23T12:00:00+02:00",
                        "modified": "2014-09-09T10:03:49+02:00",
                    }
                ],
                "pagination": {"elementsPerPage": 20},
                "links": {
                    "first": "https://ratsinfo.leipzig.de/bi/oparl/1.0/papers.asp?body=2387",
                    "prev": "https://ratsinfo.leipzig.de/bi/oparl/1.0/papers.asp?body=2387&p=1",
                    "next": "https://ratsinfo.leipzig.de/bi/oparl/1.0/papers.asp?body=2387&p=3",
                },
            },
        )

        data = loader.load(
            "https://ratsinfo.leipzig.de/bi/oparl/1.0/papers.asp?body=2387&p=2"
        )
        assert len(data["data"]) == 1

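# spurious_500 is a helper rather than a test: it registers a 500 response
# followed by a valid page for the same URL, so it only passes when the given
# loader retries after a server error. A hypothetical caller (the concrete
# test functions and the loaders they pass in the repository may differ):
def test_loader_retries_spurious_500():
    # Assumption: the plain BaseLoader is expected to retry here; the project
    # may instead exercise a more specific loader.
    spurious_500(BaseLoader({}))
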
def handle(self, *args, **options):
    input_file: Path = options["input"]

    logger.info("Loading the data")
    with input_file.open() as fp:
        json_data = json.load(fp)

    if json_data["format_version"] != format_version:
        raise CommandError(
            f"This version of {settings.PRODUCT_NAME} can only import json format version {format_version}, "
            f"but the json file you provided is version {json_data['format_version']}"
        )

    ris_data: RisData = converter.structure(json_data, RisData)

    body = models.Body.objects.filter(name=ris_data.meta.name).first()
    if not body:
        logger.info("Building the body")

        if options["ags"] or ris_data.meta.ags:
            ags = options["ags"] or ris_data.meta.ags
        else:
            ags = city_to_ags(ris_data.meta.name, False)
        if not ags:
            raise RuntimeError(
                f"Failed to determine the Amtliche Gemeindeschlüssel for '{ris_data.meta.name}'. "
                f"Please look it up yourself and specify it with `--ags`"
            )
        logger.info(f"The Amtliche Gemeindeschlüssel is {ags}")
        body = models.Body(
            name=ris_data.meta.name, short_name=ris_data.meta.name, ags=ags
        )
        body.save()
        if not options["skip_body_extra"]:
            import_outline(body)
            import_streets(body)
    else:
        logging.info("Using existing body")

    # TODO: Re-enable this after some more thorough testing
    # handle_counts(ris_data, options["allow_shrinkage"])

    flush_model(models.Paper)
    self.import_papers(ris_data)
    self.import_files(ris_data)
    paper_id_map = make_id_map(models.Paper.objects)
    file_id_map = make_id_map(models.File.objects)
    flush_model(models.Paper.files.through)
    self.import_paper_files(ris_data, paper_id_map, file_id_map)
    flush_model(models.Organization)
    self.import_organizations(body, ris_data)
    self.import_meeting_locations(ris_data)
    locations = dict(models.Location.objects.values_list("description", "id"))
    flush_model(models.Meeting)
    self.import_meetings(ris_data, locations)
    meeting_id_map = make_id_map(
        models.Meeting.objects.filter(oparl_id__isnull=False)
    )
    organization_name_id_map = dict(
        models.Organization.objects.values_list("name", "id")
    )
    flush_model(models.Meeting.organizations.through)
    self.import_meeting_organization(
        meeting_id_map, organization_name_id_map, ris_data
    )

    flush_model(models.Person)
    self.import_persons(ris_data)
    flush_model(models.Consultation)
    self.import_consultations(ris_data, meeting_id_map, paper_id_map)

    # We don't have original ids for all agenda items (yet?),
    # so we just assume meeting x paper is unique
    consultation_map = {
        (a, b): c
        for a, b, c in models.Consultation.objects.values_list(
            "meeting_id", "paper_id", "id"
        )
    }

    flush_model(models.AgendaItem)
    self.import_agenda_items(ris_data, consultation_map, meeting_id_map, paper_id_map)
    flush_model(models.Membership)
    self.import_memberships(ris_data)
    fix_sort_date(fallback_date, datetime.datetime.now(tz=tz.tzlocal()))

    # With the current bulk indexing we need to do this manually
    call_command("search_index", action="populate")

    if not options["skip_download"]:
        Importer(BaseLoader(dict()), force_singlethread=True).load_files(
            fallback_city=body.short_name
        )

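# The handle() above belongs to a Django management command. Assuming the
# command is registered e.g. as "importjson" (the name is an assumption, not
# taken from the source), it could also be driven from code via call_command,
# using the option names that handle() reads from the options dict:
from pathlib import Path

from django.core.management import call_command

call_command("importjson", input=Path("city.json"), ags="05124000", skip_download=True)
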