def download_file(self, file: File, libobject: OParl.File):
        """Download the file behind ``libobject`` into the storage folder.

        Fixes the invalid urls of the sternberg oparl by collapsing
        ``files//rim`` to ``files/rim`` in the download url.

        The download is skipped when ``file`` already has a positive
        ``filesize`` and the remote modification time is older than
        ``file.modified``.

        On success ``file.filesize`` is set to the size of the written file
        and ``file.storage_filename`` to the SHA-1 hex digest of the object
        id. On an HTTP error status the error is logged, ``file.filesize``
        is set to ``-1`` and ``file.storage_filename`` to a placeholder
        message; nothing is written to disk.

        :param file: model instance whose download metadata is updated
        :param libobject: OParl file object providing url, id and
            modification time
        """
        url = libobject.get_download_url().replace(r"files//rim", r"files/rim")
        last_modified = self.glib_datetime_to_python(libobject.get_modified())

        if file.filesize and file.filesize > 0 and file.modified and last_modified and last_modified < file.modified:
            self.logger.info("Skipping cached Download: {}".format(url))
            return

        # Use the instance logger like every other message in this method
        self.logger.info("Downloading {}".format(url))

        # The storage name is derived from the object id, not from the url
        urlhash = hashlib.sha1(libobject.get_id().encode("utf-8")).hexdigest()
        path = os.path.join(self.storagefolder, urlhash)

        r = requests.get(url, allow_redirects=True)
        try:
            r.raise_for_status()
        except HTTPError as err:
            self.logger.error(err)
            file.storage_filename = "Error downloading File"
            file.filesize = -1
            return

        # Context manager guarantees the handle is flushed and closed before
        # the size is read back from disk
        with open(path, 'wb') as target:
            target.write(r.content)

        file.filesize = os.stat(path).st_size
        file.storage_filename = urlhash
 def extract_text_from_file(self, file: File):
     """Fill ``file.parsed_text`` from the stored copy of ``file``.

     Plain text files (mime type ``text/text``) are read as-is. PDFs are
     passed to ``extract_text_from_pdf`` together with the cache folder; when
     extraction is not allowed, a message is appended to ``self.errorlist``
     and ``parsed_text`` is left untouched. Other mime types are ignored.
     """
     path = os.path.join(self.storagefolder, file.storage_filename)
     mime_type = file.mime_type

     if mime_type == "text/text":
         with open(path) as handle:
             file.parsed_text = handle.read()
         return

     if mime_type != "application/pdf":
         return

     self.logger.info("Extracting text from PDF: " + path)
     try:
         file.parsed_text = extract_text_from_pdf(path, self.cachefolder)
     except PDFTextExtractionNotAllowed:
         self.errorlist.append("The pdf {} is encrypted".format(path))
Beispiel #3
0
 def parse_file(self, file: File, fallback_city: str):
     """Run OCR on the stored PDF of ``file`` and save the derived data.

     The PDF is read from the minio file bucket under the key
     ``str(file.id)``. When text is recognized, the cleaned text, the
     mentioned persons and the locations are set on ``file`` and it is
     saved; otherwise only a warning is logged.
     """
     logging.info("".join(("- Parsing: ", str(file.id), " (", file.name, ")")))

     storage = minio_client()
     with storage.get_object(minio_file_bucket, str(file.id)) as pdf_stream:
         ocr_text = get_ocr_text_from_pdf(pdf_stream.read())

     if len(ocr_text) == 0:
         logging.warning("Nothing recognized")
         return

     file.parsed_text = cleanup_extracted_text(ocr_text)
     # Persons are searched in the file name and the raw (uncleaned) OCR text
     file.mentioned_persons = extract_persons("\n".join((file.name, ocr_text, "")))
     found_locations = extract_locations(file.parsed_text, fallback_city)
     file.locations.set(found_locations)
     file.save()
Beispiel #4
0
 def parse_file(self, file: File):
     """Run OCR on the PDF of ``file`` below ``settings.MEDIA_ROOT``.

     When text is recognized, the cleaned text, the mentioned persons and
     the locations are set on ``file`` and it is saved; otherwise only a
     warning is logged.
     """
     logging.info("".join(("- Parsing: ", str(file.id), " (", file.name, ")")))

     # NOTE(review): dirname() is applied to the module name (__name__), not
     # to a file path -- confirm whether __file__ was intended here
     base_dir = os.path.abspath(os.path.dirname(__name__))
     pdf_path = os.path.join(base_dir, settings.MEDIA_ROOT, file.storage_filename)
     ocr_text = get_ocr_text_from_pdf(pdf_path)

     if len(ocr_text) == 0:
         logging.warning("Nothing recognized")
         return

     file.parsed_text = cleanup_extracted_text(ocr_text)
     # Persons and locations are both searched in the raw (uncleaned) OCR text
     file.mentioned_persons = extract_persons("\n".join((file.name, ocr_text, "")))
     file.locations = extract_locations(ocr_text)
     file.save()
    def test_file(self):
        """Check the converter output for the example file ``files/0``.

        The first pass verifies every converted attribute; the second pass
        adds a ``text`` entry to the api data and verifies that it ends up
        in ``parsed_text`` while the mime type stays unchanged.
        """
        converted = File()
        data = self.api_data["https://oparl.example.org/files/0"]
        self.converter.file(data, converted)

        expected_date = date(year=2013, month=1, day=4)
        first_pass = [
            ("filename", "anlage_1_zur_anfrage.pdf"),
            ("mime_type", "application/pdf"),
            ("legal_date", expected_date),
            ("sort_date", self.utils.date_to_datetime(expected_date)),
            ("filesize", None),
            ("page_count", None),
            ("parsed_text", None),
            ("license", "http://www.opendefinition.org/licenses/cc-by"),
            ("oparl_access_url", "https://oparl.example.org/files/0.pdf"),
            (
                "oparl_download_url",
                "https://oparl.example.org/files/download/57737.pdf",
            ),
        ]
        for attribute, expected in first_pass:
            self.assertEqual(getattr(converted, attribute), expected)

        # Converting again with inline text must populate parsed_text
        data["text"] = "Lorem ipsum"
        self.converter.file(data, converted)
        second_pass = [
            ("mime_type", "application/pdf"),
            ("parsed_text", "Lorem ipsum"),
        ]
        for attribute, expected in second_pass:
            self.assertEqual(getattr(converted, attribute), expected)
 def extract_text_from_file(self, file: File):
     """Extract the text of the stored copy of ``file``.

     Plain text files (mime type ``text/text``) are read as-is. For PDFs the
     text and the page count are extracted; any failure is logged, appended
     to ``self.errorlist`` and results in no text. Other mime types yield no
     text.

     The result (possibly ``None``) is stored in ``file.parsed_text`` and
     also returned.
     """
     path = os.path.join(self.storagefolder, file.storage_filename)
     mime_type = file.mime_type
     text = None

     if mime_type == "text/text":
         with open(path) as handle:
             text = handle.read()
     elif mime_type == "application/pdf":
         self.logger.info("Extracting text from PDF: " + path)
         try:
             text = extract_text_from_pdf(path)
             # Only reached when the text extraction itself succeeded
             file.page_count = get_page_count_from_pdf(path)
         except Exception as error:
             problem = "Could not parse pdf file {}: {}".format(path, error)
             self.logger.error(problem)
             self.errorlist.append(problem)

     file.parsed_text = text
     return text
    def download_file(self, file: File, libobject: OParl.File):
        """Download the file behind ``libobject`` into the storage folder.

        The download url is used when present, otherwise the access url.
        The download is skipped when ``file`` already has a positive
        ``filesize`` and the remote modification time is older than
        ``file.modified``.

        On success ``file.filesize`` is set to the size of the written file
        and ``file.storage_filename`` to the SHA-1 hex digest of the object
        id.

        :param file: model instance whose download metadata is updated
        :param libobject: OParl file object providing urls, id and
            modification time
        :raises: whatever ``raise_for_status()`` raises on an HTTP error
            status; it is not caught here
        """
        url = libobject.get_download_url() or libobject.get_access_url()
        last_modified = self.glib_datetime_to_python(libobject.get_modified())

        if file.filesize and file.filesize > 0 and file.modified and last_modified and last_modified < file.modified:
            self.logger.info("Skipping cached Download: {}".format(url))
            return

        self.logger.info("Downloading {}".format(url))

        # The storage name is derived from the object id, not from the url
        urlhash = hashlib.sha1(libobject.get_id().encode("utf-8")).hexdigest()
        path = os.path.join(self.storagefolder, urlhash)

        r = requests.get(url, allow_redirects=True)
        r.raise_for_status()

        # Context manager guarantees the handle is flushed and closed before
        # the size is read back from disk
        with open(path, 'wb') as target:
            target.write(r.content)

        file.filesize = os.stat(path).st_size
        file.storage_filename = urlhash
Beispiel #8
0
 def parse_file(self, file: File, fallback_city: str):
     """Extract locations from ``file.parsed_text`` and save them on ``file``.

     Progress and the number of locations found are written to
     ``self.stdout``.
     """
     self.stdout.write("".join(("Parsing: ", str(file.id), " (", file.name, ")")))
     found = extract_locations(file.parsed_text, fallback_city)
     self.stdout.write(str(len(found)) + " locations found")
     file.locations.set(found)
     file.save()
Beispiel #9
0
 def parse_file(self, file: File):
     """Extract the persons mentioned in ``file`` and save them on it.

     Persons are searched in the file name followed by the parsed text; a
     missing parsed text is treated as empty.
     """
     logging.info("".join(("- Parsing: ", str(file.id), " (", file.name, ")")))
     searchable = "\n".join((file.name, file.parsed_text or "", ""))
     file.mentioned_persons = extract_persons(searchable)
     file.save()
    def file(self, lib_object: JSON, file: File) -> File:
        """Copy the fields of an OParl file object onto ``file``.

        The filename is taken from ``fileName`` when set, otherwise built
        from the slugified ``name`` plus an extension, otherwise from the
        last path segment of ``accessUrl``. The display name is limited to
        200 characters; longer names are cut at a word boundary and end
        with an ellipsis.

        :param lib_object: the OParl file as a dict-like JSON object
        :param file: the model instance to fill
        :return: the same ``file`` instance
        :raises KeyError: when none of ``fileName``, ``name`` and
            ``accessUrl`` is available
        """
        cutoff = self.utils.filename_length_cutoff
        if lib_object.get("fileName"):
            filename = lib_object.get("fileName")
        elif lib_object.get("name"):
            # NOTE(review): the extension is always guessed for pdf,
            # independent of the object's mimeType -- confirm this is intended
            extension = mimetypes.guess_extension("application/pdf") or ""
            length = cutoff - len(extension)
            filename = slugify(lib_object.get("name"))[:length] + extension
        else:
            access_url = lib_object["accessUrl"]
            filename = slugify(access_url.split("/")[-1])[-cutoff:]

        # `or ""` also covers an explicit null name, which .get()'s default
        # would let through as None and crash len() below
        file.name = lib_object.get("name") or ""
        if len(file.name) > 200:
            # wrap() returns an empty list for whitespace-only input
            wrapped = textwrap.wrap(file.name, 199)
            file.name = (wrapped[0] if wrapped else "") + "\u2026"

        file.filename = filename
        file.mime_type = lib_object.get("mimeType") or "application/octet-stream"
        file.legal_date = self.utils.parse_date(lib_object.get("date"))
        # Sort by the legal date, then the creation date, then "now"
        file.sort_date = (
            self.utils.date_to_datetime(file.legal_date)
            or self.utils.parse_datetime(lib_object.get("created"))
            or timezone.now()
        )
        file.oparl_access_url = lib_object.get("accessUrl")
        file.oparl_download_url = lib_object.get("downloadUrl")
        file.filesize = None
        file.parsed_text = lib_object.get("text")
        file.license = lib_object.get("fileLicense")

        # We currently do not handle locations attached to files due
        # to the lack of data and our own location extraction

        return file