def download_file(self, file: File, libobject: OParl.File):
    """Download the file described by ``libobject`` into the storage folder.

    Fixes the invalid urls of the sternberg oparl endpoint by collapsing the
    doubled slash in ``files//rim`` before downloading.

    The download is skipped when ``file`` already has a positive filesize and
    the remote modification time is older than ``file.modified``. When the
    server answers with an HTTP error status, the error is logged and ``file``
    is marked as failed (``filesize = -1``) instead of raising. On success the
    content is written to a file named after the SHA-1 hex digest of the OParl
    id, and ``file.filesize`` / ``file.storage_filename`` are updated.
    """
    url = libobject.get_download_url().replace(r"files//rim", r"files/rim")
    last_modified = self.glib_datetime_to_python(libobject.get_modified())

    already_downloaded = (
        file.filesize
        and file.filesize > 0
        and file.modified
        and last_modified
        and last_modified < file.modified
    )
    if already_downloaded:
        self.logger.info("Skipping cached Download: {}".format(url))
        return

    # Use the instance logger here as well so that all messages of this
    # method end up in the same place.
    self.logger.info("Downloading {}".format(url))

    urlhash = hashlib.sha1(libobject.get_id().encode("utf-8")).hexdigest()
    path = os.path.join(self.storagefolder, urlhash)

    r = requests.get(url, allow_redirects=True)
    try:
        r.raise_for_status()
    except HTTPError as err:
        self.logger.error(err)
        file.storage_filename = "Error downloading File"
        file.filesize = -1
        return

    # The context manager guarantees the handle is flushed and closed before
    # the size is read back from disk.
    with open(path, "wb") as target:
        target.write(r.content)

    file.filesize = os.stat(path).st_size
    file.storage_filename = urlhash
def extract_text_from_file(self, file: File):
    """Fill ``file.parsed_text`` from the stored file, depending on its mime type.

    ``text/text`` files are read as-is. ``application/pdf`` files are passed to
    ``extract_text_from_pdf``; if that raises ``PDFTextExtractionNotAllowed``
    a message is appended to ``self.errorlist`` and ``parsed_text`` is left
    untouched. Any other mime type is ignored.
    """
    location = os.path.join(self.storagefolder, file.storage_filename)
    mime = file.mime_type

    if mime == "text/text":
        with open(location) as handle:
            file.parsed_text = handle.read()
        return

    if mime != "application/pdf":
        return

    self.logger.info("Extracting text from PDF: " + location)
    try:
        extracted = extract_text_from_pdf(location, self.cachefolder)
    except PDFTextExtractionNotAllowed:
        self.errorlist.append("The pdf {} is encrypted".format(location))
    else:
        file.parsed_text = extracted
def parse_file(self, file: File, fallback_city: str):
    """Recognize the text of a stored file and derive persons and locations.

    The file content is fetched from the minio file bucket under the file's
    id and passed to ``get_ocr_text_from_pdf``. If text was recognized, the
    cleaned-up text is stored in ``file.parsed_text``, the mentioned persons
    and locations are extracted, and the file is saved. Otherwise only a
    warning is logged and the file is left unchanged.

    ``fallback_city`` is forwarded to ``extract_locations``.
    """
    logging.info("- Parsing: " + str(file.id) + " (" + file.name + ")")

    with minio_client().get_object(minio_file_bucket, str(file.id)) as file_handle:
        recognized_text = get_ocr_text_from_pdf(file_handle.read())

    # Truthiness instead of len(): an empty result and a None result are both
    # treated as "nothing recognized" rather than crashing on len(None).
    if not recognized_text:
        logging.warning("Nothing recognized")
        return

    file.parsed_text = cleanup_extracted_text(recognized_text)
    # Persons are searched in the file name plus the raw recognized text.
    file.mentioned_persons = extract_persons(file.name + "\n" + recognized_text + "\n")
    file.locations.set(extract_locations(file.parsed_text, fallback_city))
    file.save()
def parse_file(self, file: File):
    """Recognize the text of a file stored under MEDIA_ROOT and derive metadata.

    The path is built from ``settings.MEDIA_ROOT`` and
    ``file.storage_filename`` and passed to ``get_ocr_text_from_pdf``. If text
    was recognized, the cleaned-up text, the mentioned persons and the
    locations are stored on ``file`` and the file is saved. Otherwise only a
    warning is logged and the file is left unchanged.
    """
    logging.info("- Parsing: " + str(file.id) + " (" + file.name + ")")

    # NOTE(review): dirname(__name__) is normally "" because module names
    # contain no path separator, so this resolves to the current working
    # directory. `__file__` may have been intended - confirm before changing.
    file_path = os.path.abspath(os.path.dirname(__name__))
    file_path = os.path.join(file_path, settings.MEDIA_ROOT, file.storage_filename)

    recognized_text = get_ocr_text_from_pdf(file_path)

    # Truthiness instead of len(): an empty result and a None result are both
    # treated as "nothing recognized" rather than crashing on len(None).
    if not recognized_text:
        logging.warning("Nothing recognized")
        return

    file.parsed_text = cleanup_extracted_text(recognized_text)
    # Persons are searched in the file name plus the raw recognized text.
    file.mentioned_persons = extract_persons(file.name + "\n" + recognized_text + "\n")
    file.locations = extract_locations(recognized_text)
    file.save()
def test_file(self):
    # Convert the fixture for file 0 and check every attribute the converter maps.
    converted = File()
    api_entry = self.api_data["https://oparl.example.org/files/0"]
    self.converter.file(api_entry, converted)

    # Name and type
    self.assertEqual(converted.filename, "anlage_1_zur_anfrage.pdf")
    self.assertEqual(converted.mime_type, "application/pdf")

    # Dates: the sort date is derived from the legal date
    expected_date = date(year=2013, month=1, day=4)
    expected_sort_date = self.utils.date_to_datetime(expected_date)
    self.assertEqual(converted.legal_date, expected_date)
    self.assertEqual(converted.sort_date, expected_sort_date)

    # Nothing has been downloaded or parsed yet
    self.assertEqual(converted.filesize, None)
    self.assertEqual(converted.page_count, None)
    self.assertEqual(converted.parsed_text, None)

    # License and urls
    self.assertEqual(
        converted.license, "http://www.opendefinition.org/licenses/cc-by"
    )
    self.assertEqual(
        converted.oparl_access_url, "https://oparl.example.org/files/0.pdf"
    )
    self.assertEqual(
        converted.oparl_download_url,
        "https://oparl.example.org/files/download/57737.pdf",
    )

    # A text supplied by the API is taken over as the parsed text
    api_entry["text"] = "Lorem ipsum"
    self.converter.file(api_entry, converted)
    self.assertEqual(converted.mime_type, "application/pdf")
    self.assertEqual(converted.parsed_text, "Lorem ipsum")
def extract_text_from_file(self, file: File):
    """Extract the text of the stored file, store it on ``file`` and return it.

    For ``application/pdf`` the text comes from ``extract_text_from_pdf`` and
    ``file.page_count`` is set from ``get_page_count_from_pdf``; any exception
    raised there is logged and appended to ``self.errorlist`` instead of being
    propagated. For ``text/text`` the file content is read directly. For every
    other mime type the result is ``None``.

    ``file.parsed_text`` is always overwritten with the returned value.
    """
    location = os.path.join(self.storagefolder, file.storage_filename)
    mime = file.mime_type
    content = None

    if mime == "text/text":
        with open(location) as handle:
            content = handle.read()
    elif mime == "application/pdf":
        self.logger.info("Extracting text from PDF: " + location)
        try:
            # Order matters: if counting pages fails, the text extracted
            # before is still kept.
            content = extract_text_from_pdf(location)
            file.page_count = get_page_count_from_pdf(location)
        except Exception as e:
            problem = "Could not parse pdf file {}: {}".format(location, e)
            self.logger.error(problem)
            self.errorlist.append(problem)

    file.parsed_text = content
    return content
def download_file(self, file: File, libobject: OParl.File):
    """Download the file described by ``libobject`` into the storage folder.

    The download url is preferred; the access url is used when no download
    url is given. The download is skipped when ``file`` already has a positive
    filesize and the remote modification time is older than ``file.modified``.

    On success the content is written to a file named after the SHA-1 hex
    digest of the OParl id, and ``file.filesize`` / ``file.storage_filename``
    are updated. An HTTP error status is propagated as the exception raised by
    ``raise_for_status()``.
    """
    url = libobject.get_download_url() or libobject.get_access_url()
    last_modified = self.glib_datetime_to_python(libobject.get_modified())

    already_downloaded = (
        file.filesize
        and file.filesize > 0
        and file.modified
        and last_modified
        and last_modified < file.modified
    )
    if already_downloaded:
        self.logger.info("Skipping cached Download: {}".format(url))
        return

    self.logger.info("Downloading {}".format(url))

    urlhash = hashlib.sha1(libobject.get_id().encode("utf-8")).hexdigest()
    path = os.path.join(self.storagefolder, urlhash)

    r = requests.get(url, allow_redirects=True)
    r.raise_for_status()

    # The context manager guarantees the handle is flushed and closed before
    # the size is read back from disk.
    with open(path, "wb") as target:
        target.write(r.content)

    file.filesize = os.stat(path).st_size
    file.storage_filename = urlhash
def parse_file(self, file: File, fallback_city: str):
    """Extract the locations from the file's parsed text and save them.

    Progress (the file being parsed and the number of locations found) is
    written to ``self.stdout``. ``fallback_city`` is forwarded to
    ``extract_locations``.
    """
    headline = "Parsing: " + str(file.id) + " (" + file.name + ")"
    self.stdout.write(headline)

    found = extract_locations(file.parsed_text, fallback_city)
    summary = "{} locations found".format(len(found))
    self.stdout.write(summary)

    file.locations.set(found)
    file.save()
def parse_file(self, file: File):
    """Extract the persons mentioned in the file and save them on it.

    The persons are searched in the file name followed by the parsed text; a
    missing parsed text is treated as empty.
    """
    headline = "- Parsing: " + str(file.id) + " (" + file.name + ")"
    logging.info(headline)

    text = file.parsed_text
    if not text:
        text = ""
    searchable = file.name + "\n" + text + "\n"

    file.mentioned_persons = extract_persons(searchable)
    file.save()
def file(self, lib_object: JSON, file: File) -> File:
    """Copy the attributes of an OParl file JSON object onto ``file``.

    The filename is taken from ``fileName`` if present, otherwise it is built
    from the slugified ``name`` plus an extension, otherwise from the last
    segment of ``accessUrl``; generated names are limited to
    ``self.utils.filename_length_cutoff`` characters. Names longer than 200
    characters are shortened and terminated with an ellipsis.

    ``filesize`` is always reset to ``None``; ``parsed_text`` is taken from the
    ``text`` attribute of the JSON object. Returns the same ``file`` instance.

    Raises ``KeyError`` if none of ``fileName``, ``name`` and ``accessUrl`` is
    available.
    """
    cutoff = self.utils.filename_length_cutoff
    if lib_object.get("fileName"):
        filename = lib_object.get("fileName")
    elif lib_object.get("name"):
        # NOTE(review): the extension is always the one for application/pdf,
        # independent of the object's mimeType - confirm this is intended.
        extension = mimetypes.guess_extension("application/pdf") or ""
        length = cutoff - len(extension)
        filename = slugify(lib_object.get("name"))[:length] + extension
    else:
        access_url = lib_object["accessUrl"]
        filename = slugify(access_url.split("/")[-1])[-cutoff:]

    # `or ""` also covers an explicit JSON null, which .get("name", "") would
    # pass through as None and break the length check below.
    name = lib_object.get("name") or ""
    if len(name) > 200:
        # wrap() cuts at a word boundary, but returns an empty list for
        # whitespace-only input, so fall back to a plain slice in that case.
        wrapped = textwrap.wrap(name, 199)
        name = (wrapped[0] if wrapped else name[:199]) + "\u2026"
    file.name = name

    file.filename = filename
    file.mime_type = lib_object.get("mimeType") or "application/octet-stream"
    file.legal_date = self.utils.parse_date(lib_object.get("date"))
    # Sort by the legal date if there is one, then by the creation date, and
    # use the current time as last resort.
    file.sort_date = (
        self.utils.date_to_datetime(file.legal_date)
        or self.utils.parse_datetime(lib_object.get("created"))
        or timezone.now()
    )
    file.oparl_access_url = lib_object.get("accessUrl")
    file.oparl_download_url = lib_object.get("downloadUrl")
    file.filesize = None
    file.parsed_text = lib_object.get("text")
    file.license = lib_object.get("fileLicense")

    # We currently do not handle locations attached to files due
    # to the lack of data and our own location extraction

    return file