Ejemplo n.º 1
0
    def download_and_analyze_file(self, file_id: int,
                                  address_pipeline: AddressPipeline,
                                  fallback_city: str) -> bool:
        """
        Fetches one file, uploads it to minio and extracts its text, locations and persons.

        Gives back False when the download raises an HTTPError and True in all other cases.
        """
        file = File.objects.get(id=file_id)
        url = file.get_oparl_url()

        with NamedTemporaryFile() as tmpfile:
            try:
                content, content_type = self.loader.load_file(url)
                mime_types_diverge = (content_type and file.mime_type
                                      and content_type != file.mime_type)
                if mime_types_diverge:
                    divergence = "Diverging mime types: Expected {}, got {}".format(
                        file.mime_type, content_type)
                    logger.warning(divergence)
                # The type reported by the server wins over the stored one
                file.mime_type = content_type or file.mime_type
                file.filesize = len(content)
                tmpfile.write(content)
                # Rewind so the following readers start at the first byte
                tmpfile.file.seek(0)
            except HTTPError:
                failure = "File {}: Failed to download {}".format(file.id, url)
                logger.exception(failure)
                return False

            summary = "File {}: Downloaded {} ({}, {})".format(
                file.id, url, file.mime_type, filesizeformat(file.filesize))
            logger.debug(summary)

            # The object is stored in the bucket under the stringified file id
            minio_client().put_object(
                minio_file_bucket,
                str(file.id),
                tmpfile.file,
                file.filesize,
                content_type=file.mime_type,
            )

            # Text that is already present (e.g. delivered by the api) is kept
            needs_extraction = self.download_files and not file.parsed_text
            if needs_extraction:
                extracted = extract_from_file(tmpfile.file, tmpfile.name,
                                              file.mime_type, file.id)
                file.parsed_text, file.page_count = extracted

        if not file.parsed_text:
            logger.warning("File {}: Couldn't get any text".format(file.id))
        else:
            locations = extract_locations(file.parsed_text,
                                          pipeline=address_pipeline,
                                          fallback_city=fallback_city)
            file.locations.set(locations)
            searchable_text = file.name + "\n" + file.parsed_text + "\n"
            persons = extract_persons(searchable_text)
            file.mentioned_persons.set(persons)
            logger.debug("File {}: Found {} locations and {} persons".format(
                file.id, len(locations), len(persons)))

        db.connections.close_all()
        file.save()

        return True
    def test_person_extraction(self):
        """
        Checks which persons extract_persons finds in two sample texts.

        The persons are loaded by primary key (1, 4 and 7) and must already exist
        in the test database.
        """
        frank = Person.objects.get(pk=1)
        doug = Person.objects.get(pk=4)
        will = Person.objects.get(pk=7)

        # "Frank Underwood" and the inverted "Stamper, Doug" must match, while the
        # name written without a space ("WilliamConway") must not
        text = "A text \nabout Frank Underwood, Stamper, Doug, and a \nmisspelled WilliamConway."
        persons = extract_persons(text)
        # assertIn/assertNotIn report the member and the container on failure,
        # unlike assertTrue/assertFalse on an `in` expression
        self.assertIn(doug, persons)
        self.assertIn(frank, persons)
        self.assertNotIn(will, persons)

        # Only frank is expected for the spelling "Underwood, Francis"
        text = "Also the more formal name, \"Underwood, Francis\" should be found."
        persons = extract_persons(text)
        self.assertNotIn(doug, persons)
        self.assertIn(frank, persons)
        self.assertNotIn(will, persons)
Ejemplo n.º 3
0
    def test_person_extraction(self):
        """
        Checks which persons extract_persons finds in three sample texts.

        The persons are loaded by primary key (1, 4 and 7) and must already exist
        in the test database.
        """
        frank = Person.objects.get(pk=1)
        doug = Person.objects.get(pk=4)
        will = Person.objects.get(pk=7)

        # "Frank Underwood" and the inverted "Stamper, Doug" must match, while the
        # name written without a space ("WilliamConway") must not
        text = "A text \nabout Frank Underwood, Stamper, Doug, and a \nmisspelled WilliamConway."
        persons = extract_persons(text)
        # assertIn/assertNotIn report the member and the container on failure,
        # unlike assertTrue on an `in` / `not in` expression
        self.assertIn(doug, persons)
        self.assertIn(frank, persons)
        self.assertNotIn(will, persons)

        # Only frank is expected for the spelling "Underwood, Francis"
        text = 'Also the more formal name, "Underwood, Francis" should be found.'
        persons = extract_persons(text)
        self.assertNotIn(doug, persons)
        self.assertIn(frank, persons)
        self.assertNotIn(will, persons)

        # A name that is only the prefix of a longer word ("Stampering") must not match
        text = "We should check word boundaries like Doug Stampering something."
        persons = extract_persons(text)
        self.assertNotIn(doug, persons)
Ejemplo n.º 4
0
 def parse_file(self, file: File, fallback_city: str):
     """
     Runs OCR on a stored file and saves the recognized text, persons and locations.

     The file content is read from the minio bucket under the key ``str(file.id)``.
     When the OCR yields no text, a warning is logged and the file is not modified.

     :param file: The file to parse; its name is searched for persons as well
     :param fallback_city: Passed on to extract_locations as second argument
     """
     logging.info("- Parsing: " + str(file.id) + " (" + file.name + ")")
     with minio_client().get_object(minio_file_bucket,
                                    str(file.id)) as file_handle:
         recognized_text = get_ocr_text_from_pdf(file_handle.read())

     # A truthiness check also covers None, on which len() would raise
     if not recognized_text:
         logging.warning("Nothing recognized")
         return

     file.parsed_text = cleanup_extracted_text(recognized_text)
     # Many-to-many relations are written through .set(), like locations below;
     # direct assignment raises a TypeError on Django 2.0 and later
     file.mentioned_persons.set(
         extract_persons(file.name + "\n" + recognized_text + "\n"))
     file.locations.set(
         extract_locations(file.parsed_text, fallback_city))
     file.save()
Ejemplo n.º 5
0
 def parse_file(self, file: File):
     """
     Runs OCR on a file below MEDIA_ROOT and saves the recognized text, persons and locations.

     When the OCR yields no text, a warning is logged and the file is not modified.

     :param file: The file to parse; ``file.storage_filename`` locates it on disk
     """
     logging.info("- Parsing: " + str(file.id) + " (" + file.name + ")")
     # NOTE(review): dirname() is applied to the module name (__name__), not to a
     # path; __file__ may have been intended - confirm before changing
     file_path = os.path.abspath(os.path.dirname(__name__))
     file_path = os.path.join(file_path, settings.MEDIA_ROOT,
                              file.storage_filename)
     recognized_text = get_ocr_text_from_pdf(file_path)

     # A truthiness check also covers None, on which len() would raise
     if not recognized_text:
         logging.warning("Nothing recognized")
         return

     file.parsed_text = cleanup_extracted_text(recognized_text)
     # Many-to-many relations are written through .set(); direct assignment
     # raises a TypeError on Django 2.0 and later
     file.mentioned_persons.set(
         extract_persons(file.name + "\n" + recognized_text + "\n"))
     file.locations.set(extract_locations(recognized_text))
     file.save()
    def file(self, libobject: OParl.File):
        """
        Imports an OParl file object into the File model.

        Copies the metadata from the api object, optionally downloads the file and
        extracts its text, and re-runs the location and person extraction when the
        name or the parsed text has changed.

        :param libobject: The file object as delivered by the OParl library
        :return: The result of check_for_modification unchanged if there is no file
            or no update is needed, otherwise the saved file
        """
        file, do_update = self.check_for_modification(libobject, File)
        if not file or not do_update:
            return file
        self.logger.info("Processing File {}".format(libobject.get_id()))

        # Pick the filename to display: the api's file name, else a slug of the
        # name, else the tail of the slugified access url
        if libobject.get_file_name():
            displayed_filename = libobject.get_file_name()
        elif libobject.get_name():
            # NOTE(review): the extension is always guessed for application/pdf,
            # not for the file's actual mime type - confirm this is intended
            extension = mimetypes.guess_extension("application/pdf") or ""
            # Leave room for the extension within the length cutoff
            length = self.filename_length_cutoff - len(extension)
            displayed_filename = slugify(
                libobject.get_name())[:length] + extension
        else:
            displayed_filename = slugify(
                libobject.get_access_url())[-self.filename_length_cutoff:]

        # Remembered to detect below whether the expensive extraction must be redone
        parsed_text_before = file.parsed_text
        file_name_before = file.name

        file.oparl_id = libobject.get_id()
        file.name = libobject.get_name()
        file.displayed_filename = displayed_filename
        file.mime_type = libobject.get_mime_type(
        ) or "application/octet-stream"
        file.legal_date = self.glib_datetime_to_python_date(
            libobject.get_date())
        file.sort_date = file.created
        file.oparl_access_url = libobject.get_access_url()
        file.oparl_download_url = libobject.get_download_url()

        # If no text comes from the API, don't overwrite previously extracted PDF-content with an empty string
        if libobject.get_text():
            file.parsed_text = libobject.get_text()

        if self.download_files:
            self.download_file(file, libobject)
        else:
            file.storage_filename = ""
            file.filesize = -1

        # Only extract text from the stored file when none is available yet
        parsed_text = file.parsed_text
        if file.storage_filename and not file.parsed_text:
            parsed_text = self.extract_text_from_file(file)

        file = self.call_custom_hook("sanitize_file", file)

        # Shorten names longer than 200 characters and end them with an ellipsis
        if len(file.name) > 200:
            file.name = textwrap.wrap(file.name, 199)[0] + "\u2026"

        file.save()

        if file_name_before != file.name or parsed_text_before != file.parsed_text:
            # These two operations are rather CPU-intensive, so we only perform them if something relevant has changed
            # Many-to-many relations are written through .set(); direct assignment
            # raises a TypeError on Django 2.0 and later
            file.locations.set(extract_locations(parsed_text))
            file.mentioned_persons.set(
                extract_persons(file.name + "\n" + (parsed_text or "") + "\n"))
            file.save()

        return file
Ejemplo n.º 7
0
    def download_and_analyze_file(self, file_id: int,
                                  address_pipeline: AddressPipeline,
                                  fallback_city: str) -> bool:
        """
        Downloads and analyses a single file, i.e. extracting text, locations and persons.

        Returns False if the download fails, if the server answers with text/html
        (treated as a silent error) or if saving the file fails, and True otherwise.
        """
        file = File.objects.get(id=file_id)
        url = file.get_oparl_url()

        with NamedTemporaryFile() as tmp_file:
            try:
                content, content_type = self.loader.load_file(url)
                if content_type and file.mime_type and content_type != file.mime_type:
                    logger.warning(
                        "Diverging mime types: Expected {}, got {}".format(
                            file.mime_type, content_type))
                # Compare only the media type, ignoring parameters after ";"
                if content_type and content_type.split(";")[0] == "text/html":
                    logger.error(
                        f"File {file.id}: Content type was {content_type}, this seems to be a silent error"
                    )
                    return False
                file.mime_type = content_type or file.mime_type
                tmp_file.write(content)
                # Rewind so the following readers start at the first byte
                tmp_file.file.seek(0)
                file.filesize = len(content)
            except RequestException as e:
                # Normal server error
                # The response must be compared against None: a requests Response is
                # falsy for 4xx/5xx status codes, so a plain truthiness check would
                # never reach this branch for the very errors it is meant for
                if e.response is not None and 400 <= e.response.status_code < 600:
                    logger.error(
                        f"File {file.id}: Failed to download {url} with error {e.response.status_code}"
                    )
                else:
                    logger.exception(
                        f"File {file.id}: Failed to download {url}")
                return False

            logger.debug("File {}: Downloaded {} ({}, {})".format(
                file.id, url, file.mime_type, filesizeformat(file.filesize)))

            # The upload to minio is skipped when settings.PROXY_ONLY_TEMPLATE is set
            if not settings.PROXY_ONLY_TEMPLATE:
                minio_client().put_object(
                    minio_file_bucket,
                    str(file.id),
                    tmp_file.file,
                    file.filesize,
                    content_type=file.mime_type,
                )

            # If the api has text, keep that
            if self.download_files and not file.parsed_text:
                file.parsed_text, file.page_count = extract_from_file(
                    tmp_file.file, tmp_file.name, file.mime_type, file.id)

        if file.parsed_text:
            locations = extract_locations(file.parsed_text,
                                          pipeline=address_pipeline,
                                          fallback_city=fallback_city)
            file.locations.set(locations)
            persons = extract_persons(file.name + "\n" +
                                      (file.parsed_text or "") + "\n")
            file.mentioned_persons.set(persons)
            logger.debug("File {}: Found {} locations and {} persons".format(
                file.id, len(locations), len(persons)))
        else:
            logger.warning(f"File {file.id}: Couldn't get any text")

        try:
            db.connections.close_all()
            file.save()
        except (ElasticsearchException, DatabaseError) as e:
            logger.exception(f"File {file.id}: Failed to save: {e}")
            return False

        return True
Ejemplo n.º 8
0
 def parse_file(self, file: File):
     """
     Extracts the persons mentioned in the file's name and parsed text and saves the file.

     :param file: The file to analyse; a missing parsed text is treated as empty
     """
     logging.info("- Parsing: " + str(file.id) + " (" + file.name + ")")
     # Many-to-many relations are written through .set(); direct assignment
     # raises a TypeError on Django 2.0 and later
     file.mentioned_persons.set(
         extract_persons(file.name + "\n" + (file.parsed_text or "") + "\n"))
     file.save()