def extract_text_from_file(self, file: File):
    """Fill ``file.parsed_text`` from the stored copy of ``file``.

    The stored copy is looked up as ``file.storage_filename`` inside
    ``self.storagefolder``. Only two mime types are handled:

    * ``text/text``: the whole file content is read and stored.
    * ``application/pdf``: the result of ``extract_text_from_pdf`` is
      stored. If that raises ``PDFTextExtractionNotAllowed``, a message is
      appended to ``self.errorlist`` and ``file.parsed_text`` is left
      untouched.

    For any other mime type nothing happens. The method returns ``None``.
    """
    location = os.path.join(self.storagefolder, file.storage_filename)
    mime = file.mime_type

    if mime == "text/text":
        with open(location) as handle:
            file.parsed_text = handle.read()
        return

    if mime != "application/pdf":
        return

    self.logger.info("Extracting text from PDF: " + location)
    try:
        file.parsed_text = extract_text_from_pdf(location, self.cachefolder)
    except PDFTextExtractionNotAllowed:
        self.errorlist.append("The pdf {} is encrypted".format(location))
    def file(self, lib_object: JSON, file: File) -> File:
        """Copy the fields of ``lib_object`` onto ``file`` and return ``file``.

        Args:
            lib_object: Mapping describing the file. The keys read are
                ``fileName``, ``name``, ``accessUrl``, ``mimeType``, ``date``,
                ``created``, ``downloadUrl``, ``text`` and ``fileLicense``.
            file: Object whose attributes are overwritten in place.

        Returns:
            The same ``file`` object that was passed in.

        Raises:
            KeyError: If ``lib_object`` has no usable ``fileName`` or
                ``name`` and also lacks ``accessUrl``.
        """
        cutoff = self.utils.filename_length_cutoff
        given_filename = lib_object.get("fileName")
        # An explicit JSON null for "name" must behave like a missing key;
        # otherwise the length check further down would fail on None.
        name = lib_object.get("name") or ""

        if given_filename:
            filename = given_filename
        elif name:
            # NOTE(review): the extension is always derived from
            # "application/pdf", not from lib_object["mimeType"] - confirm
            # that this is intended.
            extension = mimetypes.guess_extension("application/pdf") or ""
            length = cutoff - len(extension)
            filename = slugify(name)[:length] + extension
        else:
            access_url = lib_object["accessUrl"]
            # Slugify the last path segment and keep its last `cutoff` chars.
            filename = slugify(access_url.split("/")[-1])[-cutoff:]

        if len(name) > 200:
            # Keep the first wrapped line (at most 199 characters) and mark
            # the truncation with an ellipsis. wrap() returns an empty list
            # for whitespace-only input, which is treated as "no name".
            wrapped = textwrap.wrap(name, 199)
            name = wrapped[0] + "\u2026" if wrapped else ""
        file.name = name

        file.filename = filename
        file.mime_type = lib_object.get("mimeType") or "application/octet-stream"
        file.legal_date = self.utils.parse_date(lib_object.get("date"))
        # First available of: legal date, "created" timestamp, current time.
        file.sort_date = (
            self.utils.date_to_datetime(file.legal_date)
            or self.utils.parse_datetime(lib_object.get("created"))
            or timezone.now()
        )
        file.oparl_access_url = lib_object.get("accessUrl")
        file.oparl_download_url = lib_object.get("downloadUrl")
        file.filesize = None
        file.parsed_text = lib_object.get("text")
        file.license = lib_object.get("fileLicense")

        # We currently do not handle locations attached to files due
        # to the lack of data and our own location extraction

        return file
Exemple #3
0
 def parse_file(self, file: File, fallback_city: str):
     """Run text recognition on the stored object for ``file`` and save it.

     The object named ``str(file.id)`` is read from the bucket
     ``minio_file_bucket`` through ``minio_client()`` and its bytes are
     passed to ``get_ocr_text_from_pdf``. When text comes back,
     ``file.parsed_text``, ``file.mentioned_persons`` and ``file.locations``
     are updated and ``file.save()`` is called. Otherwise a warning is
     logged and ``file`` is left untouched.

     Args:
         file: The file to parse; modified and saved in place.
         fallback_city: Passed through to ``extract_locations``.
     """
     logging.info("- Parsing: %s (%s)", file.id, file.name)
     with minio_client().get_object(minio_file_bucket,
                                    str(file.id)) as file_handle:
         recognized_text = get_ocr_text_from_pdf(file_handle.read())

     # Truthiness covers both an empty result and None, so a missing
     # result is reported instead of raising a TypeError in len().
     if not recognized_text:
         logging.warning("Nothing recognized")
         return

     file.parsed_text = cleanup_extracted_text(recognized_text)
     # Persons are searched in the file name as well as in the raw text.
     file.mentioned_persons = extract_persons(
         file.name + "\n" + recognized_text + "\n")
     file.locations.set(
         extract_locations(file.parsed_text, fallback_city))
     file.save()
Exemple #4
0
 def parse_file(self, file: File):
     """Run text recognition on the stored copy of ``file`` and save it.

     The path handed to ``get_ocr_text_from_pdf`` is built from the current
     working directory, ``settings.MEDIA_ROOT`` and
     ``file.storage_filename``. When text comes back, ``file.parsed_text``,
     ``file.mentioned_persons`` and ``file.locations`` are updated and
     ``file.save()`` is called. Otherwise a warning is logged and ``file``
     is left untouched.

     Args:
         file: The file to parse; modified and saved in place.
     """
     logging.info("- Parsing: %s (%s)", file.id, file.name)
     # MEDIA_ROOT is resolved against the current working directory. A
     # module's __name__ never contains a path separator, so taking its
     # dirname only ever produced '' (i.e. the cwd); say so directly.
     # If MEDIA_ROOT is absolute, os.path.join discards the cwd prefix.
     file_path = os.path.join(os.getcwd(), settings.MEDIA_ROOT,
                              file.storage_filename)
     recognized_text = get_ocr_text_from_pdf(file_path)

     # Truthiness covers both an empty result and None, so a missing
     # result is reported instead of raising a TypeError in len().
     if not recognized_text:
         logging.warning("Nothing recognized")
         return

     file.parsed_text = cleanup_extracted_text(recognized_text)
     # Persons are searched in the file name as well as in the raw text.
     file.mentioned_persons = extract_persons(
         file.name + "\n" + recognized_text + "\n")
     file.locations = extract_locations(recognized_text)
     file.save()
 def extract_text_from_file(self, file: File):
     """Extract the text of ``file``, store it on ``file`` and return it.

     The stored copy is looked up as ``file.storage_filename`` inside
     ``self.storagefolder``.

     * ``text/text``: the whole file content is the result.
     * ``application/pdf``: the result comes from ``extract_text_from_pdf``
       and ``file.page_count`` is set from ``get_page_count_from_pdf``.
       Any exception raised there is logged through ``self.logger`` and
       appended to ``self.errorlist`` instead of being propagated.
     * any other mime type: the result is ``None``.

     Returns:
         The value that was assigned to ``file.parsed_text``.
     """
     location = os.path.join(self.storagefolder, file.storage_filename)
     mime = file.mime_type
     content = None

     if mime == "text/text":
         with open(location) as handle:
             content = handle.read()
     elif mime == "application/pdf":
         self.logger.info("Extracting text from PDF: " + location)
         try:
             content = extract_text_from_pdf(location)
             file.page_count = get_page_count_from_pdf(location)
         except Exception as error:
             # Best effort: record the failure and carry on with whatever
             # text was extracted before the error (possibly None).
             report = "Could not parse pdf file {}: {}".format(location, error)
             self.logger.error(report)
             self.errorlist.append(report)

     file.parsed_text = content
     return content