Code Example #1
    def test_person_no_name(self):
        data = {
            "id": "https://oparl.example.org/person/no-name",
            "type": "https://schema.oparl.org/1.1/Person",
            "created": "2011-11-11T11:11:00+01:00",
            "modified": "2012-08-16T14:05:27+02:00",
        }

        converter = JsonToDb(MockLoader())

        with self.assertLogs(json_to_db.__name__, level="WARNING") as cm:
            person = Person()
            converter.person(data, person)
            self.assertEqual(
                cm.output,
                [
                    "WARNING:"
                    + json_to_db.__name__
                    + ":Person without name: https://oparl.example.org/person/no-name",
                    "WARNING:"
                    + json_to_db.__name__
                    + ":Person without given name: https://oparl.example.org/person/no-name",
                    "WARNING:"
                    + json_to_db.__name__
                    + ":Person without family name: https://oparl.example.org/person/no-name",
                ],
            )
Code Example #2
    def test_person_only_name(self):
        data = {
            "id": "https://oparl.example.org/person/only-name",
            "type": "https://schema.oparl.org/1.1/Person",
            "name": "Max Mustermann",
            "created": "2011-11-11T11:11:00+01:00",
            "modified": "2012-08-16T14:05:27+02:00",
        }

        converter = JsonToDb(MockLoader())

        with self.assertLogs(json_to_db.__name__, level="WARNING") as cm:
            person = Person()
            converter.person(data, person)
            self.assertEqual(person.name, "Max Mustermann")
            self.assertEqual(person.given_name, "Max")
            self.assertEqual(person.family_name, "Mustermann")

            self.assertEqual(
                cm.output,
                [
                    "WARNING:"
                    + json_to_db.__name__
                    + ":Inferring given and family name from compound name"
                ],
            )
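
Both tests above use unittest's assertLogs context manager; each entry in cm.output is formatted as "LEVEL:logger_name:message". A minimal, self-contained sketch of that pattern (convert() and the logger name are hypothetical stand-ins, not part of the project):

import logging
import unittest

logger = logging.getLogger("example.converter")


def convert(data: dict) -> None:
    # Hypothetical stand-in for JsonToDb.person(): warn when the name is missing
    if "name" not in data:
        logger.warning("Person without name: %s", data.get("id"))


class AssertLogsSketch(unittest.TestCase):
    def test_warns_on_missing_name(self):
        with self.assertLogs(logger.name, level="WARNING") as cm:
            convert({"id": "https://oparl.example.org/person/no-name"})
        # Each captured entry is "LEVEL:logger_name:message"
        self.assertEqual(
            cm.output,
            [
                "WARNING:example.converter:Person without name: "
                "https://oparl.example.org/person/no-name"
            ],
        )


if __name__ == "__main__":
    unittest.main()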
Code Example #3
    def __init__(
        self,
        loader: BaseLoader,
        default_body: Optional[Body] = None,
        ignore_modified: bool = False,
        download_files: bool = True,
        force_singlethread: bool = False,
    ):
        self.force_singlethread = force_singlethread
        self.ignore_modified = ignore_modified
        self.download_files = download_files

        self.loader = loader
        default_body = (
            default_body
            or Body.objects.filter(id=settings.SITE_DEFAULT_BODY).first())
        self.converter = JsonToDb(loader, default_body=default_body)
Code Example #4
    def setUpClass(cls):
        super().setUpClass()
        cls.api_data = {}
        cls.loader = MockLoader()
        cls.loader.api_data = cls.api_data
        for file in os.listdir(cls.dummy_data):
            if not file.endswith(".json"):
                continue

            with open(os.path.join(cls.dummy_data, file)) as fp:
                data = json.load(fp)
                cls.api_data[data["id"]] = data
                for entry in externalize(data):
                    if entry.data["id"] not in cls.api_data:
                        cls.api_data[entry.data["id"]] = entry.data

        # Used by test_location_default_body
        body = Body()
        body.short_name = "München"
        cls.converter = JsonToDb(cls.loader, default_body=body)
        cls.converter.warn_missing = False
        cls.utils = Utils()
Code Example #5
def test_json_to_db_empty_object(caplog):
    url = "https://lahr.ratsinfomanagement.net/webservice/oparl/v1.1/body/1/consultation/5999"
    loader = MockLoader(api_data={url: {}})
    converter = JsonToDb(loader,
                         default_body=Body(),
                         ensure_organization_type=False)
    with pytest.raises(
            RuntimeError,
            match=
            f"The object {url} has not type field and object_type wasn't given",
    ):
        converter.import_anything(url)
    converter.import_anything(url, Consultation)
    assert Consultation.objects.filter(oparl_id=url).count() == 1
    assert caplog.messages == [
        f"Object loaded from {url} has no type field, inferred to https://schema.oparl.org/1.0/Consultation",
        f"Object loaded from {url} has no id field, setting id to url",
    ]
Code Example #6
def test_json_to_db_missing_object(caplog):
    url = "https://lahr.ratsinfomanagement.net/webservice/oparl/v1.1/body/1/consultation/5999"
    loader = MockLoader(api_data={url: None})
    converter = JsonToDb(loader,
                         default_body=Body(),
                         ensure_organization_type=False)
    with pytest.raises(
            RuntimeError,
            match=
            rf"The object {url} is missing and the object type was not specified",
    ):
        converter.import_anything(url)
    converter.import_anything(url, Consultation)
    assert Consultation.objects.filter(oparl_id=url).count() == 1
    assert caplog.messages == [
        f"JSON loaded from {url} is not a dict/object. Using a dummy instead. THIS IS BAD",
        f"JSON loaded from {url} is not a dict/object. Using a dummy instead. THIS IS BAD",
    ]
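
These two tests combine pytest.raises(match=...) with the caplog fixture, whose caplog.messages attribute holds the plain message strings of the captured records. A minimal, self-contained sketch of the same pattern (import_anything() here is a hypothetical stand-in, not the project's implementation):

import logging

import pytest

logger = logging.getLogger(__name__)


def import_anything(url: str, object_type=None) -> None:
    # Hypothetical stand-in: fail when no type hint is available,
    # otherwise warn about the unusable payload and carry on.
    if object_type is None:
        raise RuntimeError(
            f"The object {url} is missing and the object type was not specified"
        )
    logger.warning(
        "JSON loaded from %s is not a dict/object. Using a dummy instead. THIS IS BAD",
        url,
    )


def test_missing_object(caplog):
    url = "https://oparl.example.org/consultation/1"
    with pytest.raises(RuntimeError, match="object type was not specified"):
        import_anything(url)
    with caplog.at_level(logging.WARNING):
        import_anything(url, object_type=object)
    assert caplog.messages == [
        f"JSON loaded from {url} is not a dict/object. Using a dummy instead. THIS IS BAD"
    ]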
Code Example #7
class Importer:
    lists = ["paper", "person", "meeting", "organization"]

    def __init__(
        self,
        loader: BaseLoader,
        default_body: Optional[Body] = None,
        ignore_modified: bool = False,
        download_files: bool = True,
        force_singlethread: bool = False,
    ):
        self.force_singlethread = force_singlethread
        self.ignore_modified = ignore_modified
        self.download_files = download_files

        self.loader = loader
        default_body = (
            default_body
            or Body.objects.filter(id=settings.SITE_DEFAULT_BODY).first())
        self.converter = JsonToDb(loader, default_body=default_body)

    def run(self, body_id: str) -> None:
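        """Runs a full initial import for one body: loads the body, fetches its
        object lists, imports the body itself and then imports all other objects."""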
        [body_data] = self.load_bodies(body_id)
        self.fetch_lists_initial([body_data.data])
        [body] = self.import_bodies()
        self.converter.default_body = body
        self.import_objects()

    def import_anything(self, oparl_id: str) -> DefaultFields:
        return self.converter.import_anything(oparl_id)

    def fetch_lists_initial(self, bodies: List[JSON]) -> None:
        all_lists = []
        for body_entry in bodies:
            for list_type in self.lists:
                all_lists.append(body_entry[list_type])

        if not self.force_singlethread:
            # These lists are implemented so extremely slowly that parallelizing them gives a big performance boost
            with ThreadPoolExecutor() as executor:
                list(executor.map(self.fetch_list_initial, all_lists))
        else:
            for external_list in all_lists:
                self.fetch_list_initial(external_list)

    T = TypeVar("T", bound=DefaultFields)

    def import_type(self,
                    type_class: Type[T],
                    update: bool = False) -> List[T]:
        """ Import all object of a given type """

        type_name = type_class.__name__
        import_function = self.converter.type_to_function(type_class)
        related_function = self.converter.type_to_related_function(type_class)

        # Couldn't we make this much faster by using bulk_create and deferring the related_function
        # until afterwards? We could if we were using Postgres, but with MySQL Django doesn't set the
        # id after saving with bulk_create, which means we can't use related_function without resorting
        # to really ugly hacks

        all_to_import = CachedObject.objects.filter(
            to_import=True, oparl_type=type_name).all()

        logger.info("Importing all {} {} (update={})".format(
            all_to_import.count(), type_name, update))

        pbar = None
        if sys.stdout.isatty() and not settings.TESTING:
            pbar = tqdm(total=all_to_import.count())

        all_instances = []
        for to_import in all_to_import:
            if update:
                instance = (type_class.objects_with_deleted.filter(
                    oparl_id=to_import.url).first() or type_class())
            else:
                instance = type_class()
            self.converter.init_base(to_import.data, instance)
            if not instance.deleted:
                import_function(to_import.data, instance)
                self.converter.utils.call_custom_hook(
                    "sanitize_" + type_name.lower(), instance)

            instance.save()
            if related_function and not instance.deleted:
                related_function(to_import.data, instance)
            all_instances.append(instance)

            if pbar:
                pbar.update()

        if pbar:
            pbar.close()

        all_to_import.update(to_import=False)

        return all_instances

    def import_bodies(self, update: bool = False) -> List[Body]:
        self.import_type(LegislativeTerm, update)
        self.import_type(Location, update)
        return self.import_type(Body, update)

    def import_objects(self, update: bool = False) -> None:
        import_plan = [
            File,
            Person,
            Organization,
            Membership,
            Meeting,
            Paper,
            Consultation,
            AgendaItem,
        ]

        for type_class in import_plan:
            self.import_type(type_class, update)

    def load_bodies(self,
                    single_body_id: Optional[str] = None
                    ) -> List[CachedObject]:
        self.fetch_list_initial(self.loader.system["body"])
        if single_body_id:
            bodies = [CachedObject.objects.get(url=single_body_id)]
            CachedObject.objects.filter(
                to_import=True, oparl_type="Body").exclude(
                    url=single_body_id).update(to_import=False)
        else:
            bodies = list(
                CachedObject.objects.filter(to_import=True,
                                            oparl_type="Body").all())
        return bodies

    def fetch_list_initial(self, url: str) -> None:
        """ Saves a complete external list as flattened json to the database """
        logger.info("Fetching List {}".format(url))

        timestamp = timezone.now()
        next_url = url
        all_objects = set()
        while next_url:
            logger.info("Fetching {}".format(next_url))
            response = self.loader.load(next_url)

            objects = set()

            for element in response["data"]:
                externalized = externalize(element)
                for i in externalized:
                    if not i.data.get("deleted") and i not in all_objects:
                        objects.update(externalized)

            next_url = response["links"].get("next")

            # We can't move that block outside the loop due to mysql's max_allowed_packet, which manifests
            # as "MySQL server has gone away" https://stackoverflow.com/a/36637118/3549270
            # We'll be able to solve this a lot better after the Django 2.2 update with ignore_conflicts
            try:
                # Also avoid "MySQL server has gone away" errors due to timeouts
                # https://stackoverflow.com/a/32720475/3549270
                db.close_old_connections()
                # The tests are run with sqlite, which failed here with a TransactionManagementError:
                # "An error occurred in the current transaction. You can't execute queries until the end of the 'atomic' block."
                # That's why we build our own atomic block
                if settings.TESTING:
                    with transaction.atomic():
                        saved_objects = CachedObject.objects.bulk_create(
                            objects)
                else:
                    saved_objects = CachedObject.objects.bulk_create(objects)
            except IntegrityError:
                saved_objects = set()
                for i in objects:
                    defaults = {
                        "data": i.data,
                        "to_import": True,
                        "oparl_type": i.oparl_type,
                    }
                    saved_objects.add(
                        CachedObject.objects.update_or_create(
                            url=i.url, defaults=defaults)[0])

            all_objects.update(saved_objects)
        logger.info("Found {} objects in {}".format(len(all_objects), url))
        ExternalList(url=url, last_update=timestamp).save()

    def fetch_list_update(self, url: str) -> List[str]:
        """ Saves a complete external list as flattened json to the database """
        fetch_later = []

        timestamp = timezone.now()
        external_list = ExternalList.objects.get(url=url)
        logger.info("Last modified for {}: {}".format(
            url, external_list.last_update.isoformat()))
        # There must not be microseconds in the query datetimes
        # (Wuppertal rejects that and it's not standard compliant)
        modified_since_query = {
            "modified_since":
            external_list.last_update.replace(microsecond=0).isoformat()
        }
        next_url = url
        while next_url:
            # Handles both the case where modified_since is given with
            # the next url and where it isn't
            if "modified_since" in parse_qs(urlparse(next_url).query):
                response = self.loader.load(next_url)
            else:
                response = self.loader.load(next_url, modified_since_query)
            for element in response["data"]:
                fetch_later += self._process_element(element)

            next_url = response["links"].get("next")

        external_list.last_update = timestamp
        external_list.save()

        return fetch_later

    def is_url(self, value: Any) -> bool:
        if not isinstance(value, str):
            return False

        try:
            URLValidator()(value)
            return True
        except ValidationError:
            return False

    def _process_element(self, element: JSON) -> List[str]:
        keys_of_interest = set()  # type: Set[str]
        new = list(externalize(element, keys_of_interest))
        # Find the ids of removed embedded objects
        # This way is not elegant, but it gets the job done.
        old_element = CachedObject.objects.filter(url=element["id"]).first()
        old_urls = set()
        if old_element:
            for key in keys_of_interest:
                if isinstance(old_element.data.get(key), list):
                    old_urls.update(old_element.data[key])
                elif isinstance(old_element.data.get(key), str):
                    old_urls.add(old_element.data[key])

        removed = old_urls - set([i.url for i in new])
        fetch_later = CachedObject.objects.filter(url__in=removed).values_list(
            "url", flat=True)
        for instance in new:
            existing = CachedObject.objects.filter(url=instance.url).first()
            if existing:
                if existing.data == instance.data:
                    continue
                else:
                    existing.data = instance.data
                    existing.to_import = True
                    existing.save()
            else:
                instance.save()
        return fetch_later

    def update(self, body_id: str) -> None:
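        """Runs an incremental import for a single body, based on the
        modified_since timestamps of its external lists."""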
        fetch_later = self.fetch_list_update(self.loader.system["body"])

        # We only want to import a single body, so we mark the others as already imported
        CachedObject.objects.filter(
            to_import=True,
            oparl_type="Body").exclude(url=body_id).update(to_import=False)

        self.import_bodies(update=True)

        bodies = CachedObject.objects.filter(url=body_id).all()
        for body_entry in bodies:
            for list_type in self.lists:
                fetch_later += self.fetch_list_update(
                    body_entry.data[list_type])

        logger.info("Importing {} removed embedded objects".format(
            len(fetch_later)))
        for later in fetch_later:
            # We might actually have that object freshly from somewhere else
            fresh = CachedObject.objects.filter(url=later,
                                                to_import=True).exists()

            if not fresh:
                data = self.loader.load(later)
                CachedObject.objects.filter(url=later).update(
                    data=data,
                    oparl_type=data["type"].split("/")[-1],
                    to_import=True)

        self.import_objects(update=True)

    def download_and_analyze_file(self, file_id: int,
                                  address_pipeline: AddressPipeline,
                                  fallback_city: str) -> bool:
        """
        Downloads and analyses a single file, i.e. extracting text, locations and persons.

        Returns False for http errors on downloading and True otherwise.
        """
        file = File.objects.get(id=file_id)
        url = file.get_oparl_url()

        with NamedTemporaryFile() as tmpfile:
            try:
                content, content_type = self.loader.load_file(url)
                if content_type and file.mime_type and content_type != file.mime_type:
                    logger.warning(
                        "Diverging mime types: Expected {}, got {}".format(
                            file.mime_type, content_type))
                file.mime_type = content_type or file.mime_type
                tmpfile.write(content)
                tmpfile.file.seek(0)
                file.filesize = len(content)
            except RequestException:
                logger.exception("File {}: Failed to download {}".format(
                    file.id, url))
                return False

            logger.debug("File {}: Downloaded {} ({}, {})".format(
                file.id, url, file.mime_type, filesizeformat(file.filesize)))

            if not settings.PROXY_ONLY_TEMPLATE:
                minio_client().put_object(
                    minio_file_bucket,
                    str(file.id),
                    tmpfile.file,
                    file.filesize,
                    content_type=file.mime_type,
                )

            # If the api has text, keep that
            if self.download_files and not file.parsed_text:
                file.parsed_text, file.page_count = extract_from_file(
                    tmpfile.file, tmpfile.name, file.mime_type, file.id)

        if file.parsed_text:
            locations = extract_locations(file.parsed_text,
                                          pipeline=address_pipeline,
                                          fallback_city=fallback_city)
            file.locations.set(locations)
            persons = extract_persons(file.name + "\n" +
                                      (file.parsed_text or "") + "\n")
            file.mentioned_persons.set(persons)
            logger.debug("File {}: Found {} locations and {} persons".format(
                file.id, len(locations), len(persons)))
        else:
            logger.warning("File {}: Couldn't get any text".format(file.id))

        db.connections.close_all()
        file.save()

        return True

    def load_files(self,
                   fallback_city: str,
                   max_workers: Optional[int] = None) -> Tuple[int, int]:
        """Downloads and analyses the actual file for the file entries in the database.

        Returns the number of successful and failed files"""
        # This is partially bound by waiting on external resources, but mostly very cpu intensive,
        # so we can spawn a bunch of processes to make this a lot faster.
        # We need to build a list because mysql connections and process pools don't pair well.
        files = list(
            File.objects.filter(
                filesize__isnull=True,
                oparl_access_url__isnull=False).order_by("-id").values_list(
                    "id", flat=True))
        logger.info("Downloading and analysing {} files".format(len(files)))
        address_pipeline = AddressPipeline(create_geoextract_data())
        pbar = None
        if sys.stdout.isatty() and not settings.TESTING:
            pbar = tqdm(total=len(files))
        failed = 0
        successful = 0

        if not self.force_singlethread:
            # We need to close the database connections, which will automatically be reopened for
            # each process
            # See https://stackoverflow.com/a/10684672/3549270
            # and https://brobin.me/blog/2017/05/mutiprocessing-in-python-django-management-commands/
            db.connections.close_all()

            with ProcessPoolExecutor(max_workers=max_workers) as executor:
                for succeeded in executor.map(
                        self.download_and_analyze_file,
                        files,
                        repeat(address_pipeline),
                        repeat(fallback_city),
                ):
                    if not succeeded:
                        failed += 1
                    else:
                        successful += 1
                    if pbar:
                        pbar.update()

        else:
            for file in files:
                succeeded = self.download_and_analyze_file(
                    file, address_pipeline, fallback_city)

                if not succeeded:
                    failed += 1
                else:
                    successful += 1

                if pbar:
                    pbar.update()
        if pbar:
            pbar.close()

        if failed > 0:
            logger.error("{} files failed to download".format(failed))

        return successful, failed
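
load_files() fans the per-file work out to a process pool, pairing each file id with the constant arguments via itertools.repeat(). A minimal, self-contained sketch of just that pattern (analyze() and run_all() are hypothetical stand-ins; the real worker is download_and_analyze_file() above):

from concurrent.futures import ProcessPoolExecutor
from itertools import repeat
from typing import List, Tuple


def analyze(file_id: int, fallback_city: str) -> bool:
    # Hypothetical stand-in for download_and_analyze_file():
    # returns True on success and False on a download failure.
    return file_id % 2 == 0


def run_all(file_ids: List[int], fallback_city: str) -> Tuple[int, int]:
    successful = failed = 0
    with ProcessPoolExecutor() as executor:
        # executor.map() zips its iterables, so every call receives one file id
        # plus the same fallback_city, mirroring the call in load_files() above.
        for succeeded in executor.map(analyze, file_ids, repeat(fallback_city)):
            if succeeded:
                successful += 1
            else:
                failed += 1
    return successful, failed


if __name__ == "__main__":
    print(run_all(list(range(10)), "München"))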