Beispiel #1
0
    def download_and_analyze_file(self, file_id: int,
                                  address_pipeline: AddressPipeline,
                                  fallback_city: str) -> bool:
        """
        Download a single file, store it and extract text, locations and persons.

        The file is fetched through ``self.loader``, uploaded to the minio file
        bucket under its database id and, when ``self.download_files`` is set and
        the file has no text yet, passed to ``extract_from_file``. Locations and
        persons found in the text are attached to the file before it is saved.

        :param file_id: Primary key of the ``File`` to process.
        :param address_pipeline: Forwarded to ``extract_locations`` as ``pipeline``.
        :param fallback_city: Forwarded to ``extract_locations`` as ``fallback_city``.
        :return: False if downloading raised an ``HTTPError``, True otherwise.
        """
        file = File.objects.get(id=file_id)
        url = file.get_oparl_url()

        with NamedTemporaryFile() as tmpfile:
            # Only the download itself is expected to raise HTTPError, so keep
            # the guarded region limited to that call.
            try:
                content, content_type = self.loader.load_file(url)
            except HTTPError:
                logger.exception(f"File {file.id}: Failed to download {url}")
                return False

            if content_type and file.mime_type and content_type != file.mime_type:
                logger.warning(
                    f"Diverging mime types: Expected {file.mime_type}, got {content_type}"
                )
            # Prefer what the server reported, fall back to the stored value
            file.mime_type = content_type or file.mime_type
            tmpfile.write(content)
            tmpfile.file.seek(0)
            file.filesize = len(content)

            logger.debug(
                f"File {file.id}: Downloaded {url} "
                f"({file.mime_type}, {filesizeformat(file.filesize)})"
            )

            minio_client().put_object(
                minio_file_bucket,
                str(file.id),
                tmpfile.file,
                file.filesize,
                content_type=file.mime_type,
            )

            # If the api has text, keep that
            if self.download_files and not file.parsed_text:
                # The upload above read from this same handle; rewind so the
                # extraction starts at the first byte no matter where the
                # upload left the stream position.
                tmpfile.file.seek(0)
                file.parsed_text, file.page_count = extract_from_file(
                    tmpfile.file, tmpfile.name, file.mime_type, file.id)

        if file.parsed_text:
            locations = extract_locations(file.parsed_text,
                                          pipeline=address_pipeline,
                                          fallback_city=fallback_city)
            file.locations.set(locations)
            # The file name is searched for persons as well as the text
            persons = extract_persons(file.name + "\n" + file.parsed_text + "\n")
            file.mentioned_persons.set(persons)
            logger.debug(
                f"File {file.id}: Found {len(locations)} locations "
                f"and {len(persons)} persons"
            )
        else:
            logger.warning(f"File {file.id}: Couldn't get any text")

        # NOTE(review): all connections are closed right before saving,
        # presumably so the save runs on a fresh connection after the
        # long-running download and extraction - confirm.
        db.connections.close_all()
        file.save()

        return True
Beispiel #2
0
 def test_pdf_parsing(self):
     """Extract text and page count from the sample PDF in the test media root."""
     pdf_path = os.path.join(test_media_root,
                             "Donald Knuth - The Complexity of Songs.pdf")
     with open(pdf_path, "rb") as fp:
         parsed_text, page_count = extract_from_file(
             fp, pdf_path, "application/pdf", 0)
     # assertIn reports both the needle and the haystack on failure,
     # unlike assertTrue on a pre-evaluated boolean
     self.assertIn("bottles of beer", parsed_text)
     self.assertEqual(page_count, 3)
def test_pdf_parsing(pytestconfig, caplog):
    """The sample PDF yields its text and page count without any log output."""
    pdf_path = (pytestconfig.rootpath / test_media_root /
                "Donald Knuth - The Complexity of Songs.pdf")

    with open(pdf_path, "rb") as handle:
        parsed_text, page_count = extract_from_file(
            handle, pdf_path, "application/pdf", 0)

    # A clean extraction must not log warnings or errors
    assert caplog.messages == []
    assert "bottles of beer" in parsed_text
    assert page_count == 3
def test_pdf_parsing_oom(pytestconfig, caplog):
    """Check error handling when pdftotext tries to use more than the allowed memory.

    The text extraction is expected to fail with exit status 127 and be logged,
    while the page count is still determined.

    NOTE(review): nothing in this function lowers the memory limit; presumably
    a decorator or fixture outside of it does - confirm.
    """
    pdf_path = (pytestconfig.rootpath / test_media_root /
                "Donald Knuth - The Complexity of Songs.pdf")

    with open(pdf_path, "rb") as handle:
        parsed_text, page_count = extract_from_file(
            handle, pdf_path, "application/pdf", 0)

    expected_log = (
        "File 0: Failed to run pdftotext: Command '['pdftotext', "
        f"PosixPath('{pdf_path}'), '-']' returned non-zero exit status 127."
    )
    assert caplog.messages == [expected_log]
    assert parsed_text is None
    assert page_count == 3
def test_pdf_as_tiff(pytestconfig, caplog, filename):
    """A tiff tagged as pdf, making PyPDF2 fail

    Both the text extraction and the page count are expected to fail and to be
    logged, leaving no text and no page count.

    https://github.com/codeformuenster/kubernetes-deployment/pull/65#issuecomment-894232803"""
    media_path = pytestconfig.rootpath / "testdata/media" / filename

    with open(media_path, "rb") as handle:
        parsed_text, page_count = extract_from_file(
            handle, media_path, "application/pdf", 0)

    pdftotext_failure = (
        "File 0: Failed to run pdftotext: Command '['pdftotext', "
        f"PosixPath('{media_path}'), '-']' returned non-zero exit status 1."
    )
    page_count_failure = "File 0: Pdf does not allow to read the number of pages"
    assert caplog.messages == [pdftotext_failure, page_count_failure]
    assert not parsed_text
    assert not page_count
Beispiel #6
0
    def download_and_analyze_file(self, file_id: int,
                                  address_pipeline: AddressPipeline,
                                  fallback_city: str) -> bool:
        """
        Download a single file, store it and extract text, locations and persons.

        The file is fetched through ``self.loader``, uploaded to the minio file
        bucket under its database id (unless ``settings.PROXY_ONLY_TEMPLATE`` is
        set) and, when ``self.download_files`` is set and the file has no text
        yet, passed to ``extract_from_file``. Locations and persons found in the
        text are attached to the file before it is saved.

        :param file_id: Primary key of the ``File`` to process.
        :param address_pipeline: Forwarded to ``extract_locations`` as ``pipeline``.
        :param fallback_city: Forwarded to ``extract_locations`` as ``fallback_city``.
        :return: False if the download raised a ``RequestException``, if the
            response was ``text/html`` or if saving failed; True otherwise.
        """
        file = File.objects.get(id=file_id)
        url = file.get_oparl_url()

        with NamedTemporaryFile() as tmp_file:
            # Only the download itself is expected to raise RequestException,
            # so keep the guarded region limited to that call.
            try:
                content, content_type = self.loader.load_file(url)
            except RequestException as e:
                # Normal server error.
                # Compare against None explicitly: assuming this is the
                # `requests` library, a Response is falsy for 4xx/5xx status
                # codes, so a plain truthiness check would never match here.
                if e.response is not None and 400 <= e.response.status_code < 600:
                    logger.error(
                        f"File {file.id}: Failed to download {url} with error {e.response.status_code}"
                    )
                else:
                    logger.exception(
                        f"File {file.id}: Failed to download {url}")
                return False

            if content_type and file.mime_type and content_type != file.mime_type:
                logger.warning(
                    f"Diverging mime types: Expected {file.mime_type}, got {content_type}"
                )
            # Ignore parameters such as the charset when checking for html
            if content_type and content_type.split(";")[0] == "text/html":
                logger.error(
                    f"File {file.id}: Content type was {content_type}, this seems to be a silent error"
                )
                return False
            # Prefer what the server reported, fall back to the stored value
            file.mime_type = content_type or file.mime_type
            tmp_file.write(content)
            tmp_file.file.seek(0)
            file.filesize = len(content)

            logger.debug(
                f"File {file.id}: Downloaded {url} "
                f"({file.mime_type}, {filesizeformat(file.filesize)})"
            )

            if not settings.PROXY_ONLY_TEMPLATE:
                minio_client().put_object(
                    minio_file_bucket,
                    str(file.id),
                    tmp_file.file,
                    file.filesize,
                    content_type=file.mime_type,
                )

            # If the api has text, keep that
            if self.download_files and not file.parsed_text:
                # The optional upload above read from this same handle; rewind
                # so the extraction starts at the first byte whether or not the
                # upload ran.
                tmp_file.file.seek(0)
                file.parsed_text, file.page_count = extract_from_file(
                    tmp_file.file, tmp_file.name, file.mime_type, file.id)

        if file.parsed_text:
            locations = extract_locations(file.parsed_text,
                                          pipeline=address_pipeline,
                                          fallback_city=fallback_city)
            file.locations.set(locations)
            # The file name is searched for persons as well as the text
            persons = extract_persons(file.name + "\n" + file.parsed_text + "\n")
            file.mentioned_persons.set(persons)
            logger.debug(
                f"File {file.id}: Found {len(locations)} locations "
                f"and {len(persons)} persons"
            )
        else:
            logger.warning(f"File {file.id}: Couldn't get any text")

        try:
            # NOTE(review): all connections are closed right before saving,
            # presumably so the save runs on a fresh connection after the
            # long-running download and extraction - confirm.
            db.connections.close_all()
            file.save()
        except (ElasticsearchException, DatabaseError) as e:
            logger.exception(f"File {file.id}: Failed to save: {e}")
            return False

        return True