def extract_text_from_file(self, file: File):
    """Fill ``file.parsed_text`` from the stored copy of ``file``.

    The stored copy is located by joining ``self.storagefolder`` with
    ``file.storage_filename``. What happens next depends on
    ``file.mime_type``:

    * ``"application/pdf"``: the path and ``self.cachefolder`` are handed to
      ``extract_text_from_pdf`` and the result is stored on the file. If that
      call raises ``PDFTextExtractionNotAllowed``, a message is appended to
      ``self.errorlist`` and ``file.parsed_text`` is left as it was.
    * ``"text/text"``: the file is opened in text mode and its whole content
      is stored.
    * anything else: ``file`` is not modified.

    Always returns ``None``.
    """
    stored_path = os.path.join(self.storagefolder, file.storage_filename)

    if file.mime_type == "application/pdf":
        self.logger.info("Extracting text from PDF: " + stored_path)
        try:
            file.parsed_text = extract_text_from_pdf(stored_path, self.cachefolder)
        except PDFTextExtractionNotAllowed:
            # Not fatal: remember the problem and carry on with other files.
            self.errorlist.append("The pdf {} is encrypted".format(stored_path))
        return

    if file.mime_type == "text/text":
        with open(stored_path) as handle:
            file.parsed_text = handle.read()
def file(self, lib_object: JSON, file: File) -> File:
    """Copy the fields of a JSON file object onto ``file`` and return it.

    Filename selection, in order of preference:

    1. ``lib_object["fileName"]`` as-is, if it is non-empty.
    2. The slugified ``lib_object["name"]`` plus the extension guessed for
       ``application/pdf``, truncated so the total length stays within
       ``self.utils.filename_length_cutoff``.
    3. The slugified last path segment of ``lib_object["accessUrl"]``,
       keeping at most the last ``filename_length_cutoff`` characters.

    The display name is limited to 200 characters: a longer name is cut at
    a word boundary (via ``textwrap.wrap``) and gets a trailing ellipsis.
    A missing or ``null`` name becomes the empty string.

    Args:
        lib_object: Mapping with the remote file's attributes. It must
            contain ``accessUrl`` when neither ``fileName`` nor ``name`` is
            set.
        file: The object to populate. It is modified in place.

    Returns:
        The same ``file`` object that was passed in.

    Raises:
        KeyError: If ``fileName``, ``name`` and ``accessUrl`` are all
            missing or empty.
    """
    cutoff = self.utils.filename_length_cutoff
    if lib_object.get("fileName"):
        filename = lib_object.get("fileName")
    elif lib_object.get("name"):
        # NOTE(review): the extension is always the one for PDFs, regardless
        # of the object's own mimeType - confirm that this is intended.
        extension = mimetypes.guess_extension("application/pdf") or ""
        length = cutoff - len(extension)
        filename = slugify(lib_object.get("name"))[:length] + extension
    else:
        access_url = lib_object["accessUrl"]
        filename = slugify(access_url.split("/")[-1])[-cutoff:]

    # `.get("name", "")` would still hand back None for an explicit JSON null,
    # which then breaks `len()`; `or ""` covers both missing and null.
    name = lib_object.get("name") or ""
    if len(name) > 200:
        # 199 characters plus the ellipsis stay within the 200 limit.
        # `textwrap.wrap` returns an empty list for whitespace-only input,
        # so guard the index access.
        wrapped = textwrap.wrap(name, 199)
        name = wrapped[0] + "\u2026" if wrapped else ""
    file.name = name

    file.filename = filename
    file.mime_type = lib_object.get("mimeType") or "application/octet-stream"
    file.legal_date = self.utils.parse_date(lib_object.get("date"))
    # Prefer the legal date, then the creation timestamp, then "now".
    file.sort_date = (
        self.utils.date_to_datetime(file.legal_date)
        or self.utils.parse_datetime(lib_object.get("created"))
        or timezone.now()
    )
    file.oparl_access_url = lib_object.get("accessUrl")
    file.oparl_download_url = lib_object.get("downloadUrl")
    file.filesize = None
    file.parsed_text = lib_object.get("text")
    file.license = lib_object.get("fileLicense")

    # We currently do not handle locations attached to files due
    # to the lack of data and our own location extraction

    return file
def parse_file(self, file: File, fallback_city: str):
    """Run OCR over a stored PDF and save the results on ``file``.

    The file's bytes are fetched from the ``minio_file_bucket`` bucket under
    the key ``str(file.id)`` and passed to ``get_ocr_text_from_pdf``. When
    text comes back, the cleaned-up text, the persons found in the file name
    plus the raw text, and the locations found in the cleaned-up text are
    stored on ``file`` and ``file.save()`` is called. When nothing comes back
    (empty or ``None``), a warning is logged and ``file`` is not modified.

    Args:
        file: The file to process. It is modified and saved in place.
        fallback_city: Passed through to ``extract_locations``.
    """
    logging.info("- Parsing: %s (%s)", file.id, file.name)

    with minio_client().get_object(minio_file_bucket, str(file.id)) as file_handle:
        recognized_text = get_ocr_text_from_pdf(file_handle.read())

    # Truthiness instead of `len(...) > 0`: the code below already
    # anticipated a None result, and `len(None)` would have raised before
    # that guard could ever apply.
    if not recognized_text:
        logging.warning("Nothing recognized")
        return

    file.parsed_text = cleanup_extracted_text(recognized_text)
    file.mentioned_persons = extract_persons(
        file.name + "\n" + recognized_text + "\n"
    )
    file.locations.set(extract_locations(file.parsed_text, fallback_city))
    file.save()
def parse_file(self, file: File):
    """Run OCR over a PDF below ``settings.MEDIA_ROOT`` and save the results.

    The path is built from the working directory, ``settings.MEDIA_ROOT``
    and ``file.storage_filename`` and passed to ``get_ocr_text_from_pdf``.
    When text comes back, the cleaned-up text, the persons found in the file
    name plus the raw text, and the locations found in the raw text are
    stored on ``file`` and ``file.save()`` is called. When nothing comes back
    (empty or ``None``), a warning is logged and ``file`` is not modified.

    Args:
        file: The file to process. It is modified and saved in place.
    """
    logging.info("- Parsing: %s (%s)", file.id, file.name)

    # `__name__` is a dotted module name, not a filesystem path, so its
    # dirname is "" and abspath("") yields the current working directory.
    # NOTE(review): `__file__` may have been intended here - confirm before
    # changing, since callers may rely on the cwd-relative lookup.
    file_path = os.path.abspath(os.path.dirname(__name__))
    file_path = os.path.join(file_path, settings.MEDIA_ROOT, file.storage_filename)
    recognized_text = get_ocr_text_from_pdf(file_path)

    # Truthiness instead of `len(...) > 0`: the code below already
    # anticipated a None result, and `len(None)` would have raised before
    # that guard could ever apply.
    if not recognized_text:
        logging.warning("Nothing recognized")
        return

    file.parsed_text = cleanup_extracted_text(recognized_text)
    file.mentioned_persons = extract_persons(
        file.name + "\n" + recognized_text + "\n"
    )
    file.locations = extract_locations(recognized_text)
    file.save()
def extract_text_from_file(self, file: File):
    """Extract the text of a stored file, store it on ``file`` and return it.

    The stored copy is located by joining ``self.storagefolder`` with
    ``file.storage_filename``.

    * ``"text/text"`` files are opened in text mode and read completely.
    * ``"application/pdf"`` files go through ``extract_text_from_pdf``;
      afterwards ``file.page_count`` is set from ``get_page_count_from_pdf``.
      Any exception raised by either call is logged via ``self.logger`` and
      appended to ``self.errorlist`` instead of being propagated. Text that
      was already extracted before the failure is kept.
    * For every other mime type no text is extracted.

    ``file.parsed_text`` is always overwritten, with ``None`` if nothing was
    extracted.

    Returns:
        The extracted text, or ``None``.
    """
    location = os.path.join(self.storagefolder, file.storage_filename)
    content = None

    if file.mime_type == "text/text":
        with open(location) as handle:
            content = handle.read()
    elif file.mime_type == "application/pdf":
        self.logger.info("Extracting text from PDF: " + location)
        try:
            content = extract_text_from_pdf(location)
            file.page_count = get_page_count_from_pdf(location)
        except Exception as error:
            # Best effort: a single broken PDF must not abort the whole run.
            problem = "Could not parse pdf file {}: {}".format(location, error)
            self.logger.error(problem)
            self.errorlist.append(problem)

    file.parsed_text = content
    return content