def test_meeting_start_change():
    """Meetings without an associated id can't use oparl_id as unique_id.

    Since name+start are unique in the db, and the start of a meeting can be
    updated to the actual start after the meeting happened, old meetings must
    be hard-deleted or the import will crash with a failed unique constraint.
    """
    organizations = [Organization("City Council", 1, True)]

    def council_meeting(name, original_id, start_iso):
        # Builds a "City Council" meeting at the given ISO start time.
        return Meeting(
            "City Council",
            name,
            None,
            None,
            original_id,
            start=datetime.fromisoformat(start_iso),
        )

    meetings_before = [
        council_meeting("City Council Meeting 1", None, "2020-01-01T09:00:00+01:00"),
        council_meeting("City Council Meeting 2", 2, "2020-02-01T09:00:00+01:00"),
    ]
    # Same meetings, but with the start shifted to the actual begin time.
    meetings_after = [
        council_meeting("City Council Meeting 1", None, "2020-01-01T09:00:10+01:00"),
        council_meeting("City Council Meeting 2", 2, "2020-02-01T09:00:05+01:00"),
    ]

    old = RisData(sample_city, None, [], organizations, [], [], meetings_before, [], [], 2)
    new = RisData(sample_city, None, [], organizations, [], [], meetings_after, [], [], 2)
    body = Body(name=old.meta.name, short_name=old.meta.name, ags=old.meta.ags)
    body.save()

    import_data(body, old)
    import_data(body, new)

    assert models.Meeting.objects.count() == 2
    # The old meeting without id should have been deleted
    assert models.Meeting.objects_with_deleted.count() == 3
def test_agenda_item_with_id_name_changed():
    """Renaming an agenda item that keeps its original id updates it in place.

    The second import must not leave a soft-deleted duplicate behind.
    """
    organizations = [Organization("City Council", 1, True)]
    meetings = [
        Meeting(
            "City Council",
            "City Council Meeting 1",
            None,
            None,
            1,
            start=datetime.fromisoformat("2020-01-01T09:00:00+01:00"),
        )
    ]

    def item_named(item_name):
        # Identical agenda item except for the display name.
        return AgendaItem(
            key="1",
            position=0,
            name=item_name,
            meeting_id=1,
            paper_reference=None,
            paper_original_id=None,
            original_id=1,
            result=None,
            voting=None,
            note=None,
        )

    old = RisData(sample_city, None, [], organizations, [], [], meetings, [], [item_named("Old name")], 2)
    new = RisData(sample_city, None, [], organizations, [], [], meetings, [], [item_named("New name")], 2)
    body = Body(name=old.meta.name, short_name=old.meta.name, ags=old.meta.ags)
    body.save()

    import_data(body, old)
    import_data(body, new)

    assert models.AgendaItem.objects_with_deleted.count() == 1
    assert models.AgendaItem.objects.count() == 1
def handle_counts(ris_data: RisData, allow_shrinkage: bool):
    """Prints the old and new counts and makes sure we don't accidentally delete entries.

    Args:
        ris_data: The freshly scraped dataset that is about to be imported.
        allow_shrinkage: If False, abort when the new dataset is noticeably
            smaller than what is already in the database.

    Raises:
        RuntimeError: If some category shrank by more than the allowed margin
            while allow_shrinkage is False, which indicates a scraper failure.
    """
    existing_counts = {
        "Paper": models.Paper.objects.count(),
        "File": models.File.objects.count(),
        "Person": models.Person.objects.count(),
        "Meeting": models.Meeting.objects.count(),
        "Organization": models.Organization.objects.count(),
        "Membership": models.Membership.objects.count(),
        "Agenda Item": models.AgendaItem.objects.count(),
    }
    new_counts = ris_data.get_counts()

    # PEP 8 (E731): use a named function instead of assigning a lambda.
    def format_counts(counts: dict) -> str:
        # Renders e.g. "Paper 10 | File 20 | ..."
        return " | ".join(f"{key_} {value_}" for key_, value_ in counts.items())

    logger.info(f"Existing: {format_counts(existing_counts)}")
    logger.info(f"New: {format_counts(new_counts)}")

    if allow_shrinkage:
        return

    for key, value in existing_counts.items():
        # TODO: This check currently doesn't work because there's a fixup
        # creating persons in the membership part
        if key == "Person":
            continue
        # The -3 is to allow some deletion or some failed page
        if new_counts[key] < value - 3:
            raise RuntimeError(
                f"There are {value} {key} in the database, but only {new_counts[key]} in "
                f"the imported dataset. This indicates a scraper failure. "
                f"Use `--allow-shrinkage` to override."
            )
def test_undelete():
    """A paper gets created, (spuriously?) deleted, and then undeleted."""
    with_paper = RisData(sample_city, None, [], [], [sample_paper], [], [], [], [], 2)
    without_paper = RisData(sample_city, None, [], [], [], [], [], [], [], 2)

    body = Body(
        name=with_paper.meta.name,
        short_name=with_paper.meta.name,
        ags=with_paper.meta.ags,
    )
    body.save()

    # Create, soft-delete, then re-import the same paper.
    for dataset in (with_paper, without_paper, with_paper):
        import_data(body, dataset)

    # Exactly one paper exists in total, and it is no longer marked deleted.
    [paper] = models.Paper.objects_with_deleted.all()
    assert not paper.deleted
def handle(self, *args, **options):
    """Measures how long parsing the scraped Karlsruhe json into RisData takes.

    NOTE(review): the input path is hard-coded to a local scrape-session
    checkout — this looks like a developer benchmark command, not something
    meant for production use.
    """
    input_file: Path = Path("../scrape-session/out/json/Karlsruhe.json")
    with input_file.open() as fp:
        raw = json.load(fp)

    started = time.time()
    ris_data: RisData = RisData.from_dict(raw)
    elapsed = time.time() - started

    print(elapsed, type(ris_data))
def test_manual_deletion(pytestconfig):
    """Check that after a file has been manually deleted, it can't get
    re-imported and it's gone from minio"""
    url = "https://example.org/file/1"
    file_id = 1
    bad_file = File(
        name="Bad File",
        original_id=file_id,
        url=url,
        claimed_size=None,
        paper_original_id=sample_paper.original_id,
    )
    dataset = RisData(sample_city, None, [], [], [sample_paper], [bad_file], [], [], [], 2)
    body = Body(name=dataset.meta.name, short_name=dataset.meta.name, ags=dataset.meta.ags)
    body.save()
    import_data(body, dataset)

    # First download: the file content is served from a mocked endpoint.
    with responses.RequestsMock() as mock:
        mock.add(
            responses.GET,
            url,
            body=Path(pytestconfig.rootdir)
            .joinpath("testdata/media/file.txt")
            .read_bytes(),
            status=200,
            content_type="text/plain",
        )
        importer = Importer(BaseLoader({}), force_singlethread=True)
        [successful, failed] = importer.load_files(sample_city.name)
        assert successful == 1 and failed == 0

    # Ensure that the file is there
    assert minio_client().get_object(minio_file_bucket, str(file_id))
    assert models.File.objects.filter(pk=file_id).first()

    # This is what we test
    models.File.objects.get(pk=file_id).manually_delete()

    with pytest.raises(MinioException):
        minio_client().get_object(minio_file_bucket, str(file_id))

    # Another import, to ensure that manually delete is respected
    import_data(body, dataset)
    assert not models.File.objects.filter(pk=file_id).first()

    # No request is registered: the deleted file must not be fetched again.
    with responses.RequestsMock():
        importer = Importer(BaseLoader({}), force_singlethread=True)
        [successful, failed] = importer.load_files(sample_city.name)
        assert successful == 0 and failed == 0

    with pytest.raises(MinioException):
        minio_client().get_object(minio_file_bucket, str(file_id))
def test_index_deletion():
    """Check that deleted persons get deleted from the elasticsearch index"""
    # Recreate all indices from scratch so earlier tests can't leak documents.
    for index in registry.get_indices(registry.get_models()):
        index.delete(ignore=404)
    for index in registry.get_indices(registry.get_models()):
        index.create()

    persons_before = [
        Person(name="Frank Underwood", party="Democrats"),
        Person(name="Claire Underwood", party="Democrats"),
    ]
    persons_after = [Person(name="Claire Underwood", party="Democrats")]

    old = RisData(sample_city, None, persons_before, [], [], [], [], [], [], 2)
    new = RisData(sample_city, None, persons_after, [], [], [], [], [], [], 2)
    body = Body(name=old.meta.name, short_name=old.meta.name, ags=old.meta.ags)
    body.save()

    def underwood_hits():
        # Number of search hits for the shared family name.
        return len(MainappSearch({"query": "Underwood"}).execute().hits)

    import_data(body, old)
    assert underwood_hits() == 2

    import_data(body, new)
    assert underwood_hits() == 1
def make_sample_file(file_id, url):
    """Imports a single bad file attached to the sample paper.

    Returns the saved body together with the imported dataset so callers can
    re-run the import against the same data.
    """
    bad_file = File(
        name="Bad File",
        original_id=file_id,
        url=url,
        claimed_size=None,
        paper_original_id=sample_paper.original_id,
    )
    dataset = RisData(sample_city, None, [], [], [sample_paper], [bad_file], [], [], [], 2)
    body = Body(name=dataset.meta.name, short_name=dataset.meta.name, ags=dataset.meta.ags)
    body.save()
    import_data(body, dataset)
    return body, dataset
def test_duplicate_meetings_with_id(fixture, target_number, target_number_with_deleted):
    """
    There are two meetings with the same name/start, and
    a) different ids, b) with and without id, c) without ids.
    Inspired by https://ris.wuppertal.de/si0057.php?__ksinr=18329
    and https://ris.wuppertal.de/si0057.php?__ksinr=18837
    """
    # Preload the conflicting meetings from the fixture file.
    for deserialized in serializers.deserialize("json", Path(fixture).read_text()):
        deserialized.save()

    incoming_meeting = converter.structure(
        {
            "organization_name": "BV Uellendahl-Katernberg",
            "name": "BV Uellendahl-Katernberg",
            "location": "Rathaus Barmen, Ratssaal, Johannes-Rau-Platz 1, 42275 Wuppertal",
            "note": None,
            "original_id": 18329,
            "start": "2020-04-23T18:30:00+02:00",
            "end": "2020-04-23T19:20:00+02:00",
            "cancelled": False,
        },
        Meeting,
    )

    dataset = RisData(sample_city, None, [], [], [], [], [incoming_meeting], [], [], 2)
    body = Body(
        name=dataset.meta.name,
        short_name=dataset.meta.name,
        ags=dataset.meta.ags,
    )
    body.save()

    import_data(body, dataset)

    # On failure, dump the surviving meetings to make the mismatch debuggable.
    assert models.Meeting.objects.count() == target_number, list(
        models.Meeting.objects.values_list("oparl_id", "name", "start")
    )
    assert models.Meeting.objects_with_deleted.count() == target_number_with_deleted