def parse_file(self, file: File, fallback_city: str):
    """Extract text from the stored object for *file* and persist the results.

    The object is fetched from ``minio_file_bucket`` under the key
    ``str(file.id)`` and its bytes are handed to ``get_ocr_text_from_pdf``.
    When text comes back, ``parsed_text``, ``mentioned_persons`` and
    ``locations`` are updated and the file is saved; otherwise a warning is
    logged and the file is left untouched.

    Args:
        file: The record to update; its ``id`` and ``name`` are read.
        fallback_city: Passed through unchanged to ``extract_locations``.
    """
    logging.info("- Parsing: %s (%s)", file.id, file.name)

    # The handle is only needed while the raw bytes are being read.
    with minio_client().get_object(minio_file_bucket, str(file.id)) as file_handle:
        recognized_text = get_ocr_text_from_pdf(file_handle.read())

    # Truthiness covers both an empty result and None; calling len() on None
    # would raise TypeError instead of reporting that nothing was found.
    if not recognized_text:
        logging.warning("Nothing recognized")
        return

    file.parsed_text = cleanup_extracted_text(recognized_text)
    # Persons are searched in the file name as well as in the raw text.
    file.mentioned_persons = extract_persons(
        file.name + "\n" + recognized_text + "\n"
    )
    # Locations are derived from the cleaned text, not the raw text.
    file.locations.set(extract_locations(file.parsed_text, fallback_city))
    file.save()
def parse_file(self, file: File):
    """Extract text from the PDF on disk for *file* and persist the results.

    The path is built from the current working directory,
    ``settings.MEDIA_ROOT`` and ``file.storage_filename`` and handed to
    ``get_ocr_text_from_pdf``. When text comes back, ``parsed_text``,
    ``mentioned_persons`` and ``locations`` are updated and the file is
    saved; otherwise a warning is logged and the file is left untouched.

    Args:
        file: The record to update; its ``id``, ``name`` and
            ``storage_filename`` are read.
    """
    logging.info("- Parsing: %s (%s)", file.id, file.name)

    # The previous os.path.dirname(__name__) took the directory part of a
    # module *name*, which is always "", so the base was effectively the
    # current working directory. Spell that out instead.
    file_path = os.path.join(
        os.getcwd(), settings.MEDIA_ROOT, file.storage_filename
    )
    recognized_text = get_ocr_text_from_pdf(file_path)

    # Truthiness covers both an empty result and None; calling len() on None
    # would raise TypeError instead of reporting that nothing was found.
    if not recognized_text:
        logging.warning("Nothing recognized")
        return

    file.parsed_text = cleanup_extracted_text(recognized_text)
    # Persons are searched in the file name as well as in the raw text.
    file.mentioned_persons = extract_persons(
        file.name + "\n" + recognized_text + "\n"
    )
    # NOTE(review): if ``locations`` is a many-to-many field, newer Django
    # versions require ``file.locations.set(...)`` here - confirm against
    # the model definition.
    file.locations = extract_locations(recognized_text)
    file.save()
def parse_file(self, file: File, fallback_city: str):
    """Recompute the locations of *file* from its existing ``parsed_text``.

    Progress is reported through ``self.stdout``: first the file being
    processed, then the number of locations returned by
    ``extract_locations``. The locations are stored on the file, which is
    then saved.

    Args:
        file: The record to update; ``id``, ``name`` and ``parsed_text``
            are read.
        fallback_city: Passed through unchanged to ``extract_locations``.
    """
    banner = "".join(("Parsing: ", str(file.id), " (", file.name, ")"))
    self.stdout.write(banner)

    found = extract_locations(file.parsed_text, fallback_city)
    found_count = len(found)
    self.stdout.write("{} locations found".format(found_count))

    file.locations.set(found)
    file.save()
def parse_file(self, file: File):
    """Recompute ``mentioned_persons`` of *file* and save it.

    The input handed to ``extract_persons`` is the file name, a newline,
    the existing ``parsed_text`` (an empty string when that is falsy) and a
    trailing newline.

    Args:
        file: The record to update; ``id``, ``name`` and ``parsed_text``
            are read.
    """
    banner = "".join(("- Parsing: ", str(file.id), " (", file.name, ")"))
    logging.info(banner)

    title_line = file.name + "\n"
    # A missing parsed_text contributes nothing beyond the trailing newline.
    body_text = file.parsed_text or ""
    file.mentioned_persons = extract_persons(title_line + body_text + "\n")
    file.save()