def test_file_extensions(self):
    """Check supported extensions, MIME→extension defaults, and parser lookup."""
    # All of these extensions must be advertised as supported.
    for extension in (".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"):
        self.assertIn(extension, get_supported_file_extensions())

    # MIME type → expected default extension (unknown types map to None).
    default_extension_cases = [
        ('application/pdf', ".pdf"),
        ('image/png', ".png"),
        ('image/jpeg', ".jpg"),
        ('text/plain', ".txt"),
        ('text/csv', ".csv"),
        ('aasdasd/dgfgf', None),
    ]
    for mime_type, expected_extension in default_extension_cases:
        self.assertEqual(
            get_default_file_extension(mime_type), expected_extension)

    # Known MIME types resolve to their parser classes; unknown ones to None.
    self.assertEqual(
        get_parser_class_for_mime_type('application/pdf'),
        RasterisedDocumentParser)
    self.assertEqual(
        get_parser_class_for_mime_type('text/plain'),
        TextDocumentParser)
    self.assertEqual(get_parser_class_for_mime_type('text/sdgsdf'), None)
def move_old_to_new_locations(apps, schema_editor):
    """Data migration: move archive files to per-document archive filenames.

    Documents whose old-style archive paths collide (several documents
    mapping to the same archive file) are considered "affected": their old
    archive files are deleted and the archive versions are regenerated.
    Unaffected documents simply get ``archive_filename`` set from their
    existing filename. Raises ValueError (aborting the migration) if an
    unaffected document's archive file is missing, or if an affected
    document's MIME type has no parser available to regenerate it.
    """
    # Use the historical model state, as required inside migrations.
    Document = apps.get_model("documents", "Document")

    affected_document_ids = set()

    # Maps each old archive path to the first document ID seen using it,
    # so a second document with the same path reveals a collision.
    old_archive_path_to_id = {}

    # check for documents that have incorrect archive versions
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)

        if old_path in old_archive_path_to_id:
            # Collision: both documents share one archive file — mark both.
            affected_document_ids.add(doc.id)
            affected_document_ids.add(old_archive_path_to_id[old_path])
        else:
            old_archive_path_to_id[old_path] = doc.id

    # check that archive files of all unaffected documents are in place
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)
        if doc.id not in affected_document_ids and not os.path.isfile(
                old_path):
            raise ValueError(
                f"Archived document ID:{doc.id} does not exist at: "
                f"{old_path}")

    # check that we can regenerate affected archive versions
    for doc_id in affected_document_ids:
        # Local import keeps the migration module importable even if the
        # parsers module has import-time side effects.
        from documents.parsers import get_parser_class_for_mime_type

        doc = Document.objects.get(id=doc_id)
        parser_class = get_parser_class_for_mime_type(doc.mime_type)
        if not parser_class:
            # Fail before touching any files, so the migration is abortable.
            raise ValueError(
                f"Document ID:{doc.id} has an invalid archived document, "
                f"but no parsers are available. Cannot migrate.")

    for doc in Document.objects.filter(archive_checksum__isnull=False):

        if doc.id in affected_document_ids:
            old_path = archive_path_old(doc)
            # remove affected archive versions
            if os.path.isfile(old_path):
                logger.debug(
                    f"Removing {old_path}")
                os.unlink(old_path)
        else:
            # Set archive path for unaffected files
            doc.archive_filename = archive_name_from_filename(doc.filename)
            # Update via queryset to avoid triggering model save() logic
            # (historical models in migrations have no custom save anyway).
            Document.objects.filter(id=doc.id).update(
                archive_filename=doc.archive_filename)

    # regenerate archive documents
    for doc_id in affected_document_ids:
        doc = Document.objects.get(id=doc_id)
        create_archive_version(doc)
def test_file_extensions(self):
    """Check extension support, MIME defaults, parser lookup and ext checks."""
    # Every one of these must appear in the supported-extension list.
    for extension in (".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"):
        self.assertIn(extension, get_supported_file_extensions())

    # MIME type → expected default extension; unknown types yield "".
    for mime_type, expected_extension in [
        ('application/pdf', ".pdf"),
        ('image/png', ".png"),
        ('image/jpeg', ".jpg"),
        ('text/plain', ".txt"),
        ('text/csv', ".csv"),
        ('application/zip', ".zip"),
        ('aasdasd/dgfgf', ""),
    ]:
        self.assertEqual(
            get_default_file_extension(mime_type), expected_extension)

    # Resolved parser classes must instantiate to the expected parser types.
    self.assertIsInstance(
        get_parser_class_for_mime_type('application/pdf')(
            logging_group=None),
        RasterisedDocumentParser)
    self.assertIsInstance(
        get_parser_class_for_mime_type('text/plain')(logging_group=None),
        TextDocumentParser)
    # Unknown MIME types resolve to no parser at all.
    self.assertEqual(get_parser_class_for_mime_type('text/sdgsdf'), None)

    # Extension support check: known extension passes, junk and empty fail.
    self.assertTrue(is_file_ext_supported('.pdf'))
    self.assertFalse(is_file_ext_supported('.hsdfh'))
    self.assertFalse(is_file_ext_supported(''))
def create_archive_version(doc, retry_count=3):
    """Regenerate the archive file for *doc* by re-parsing its source file.

    Tries up to ``retry_count`` times on ParseError (sleeping 5s between
    attempts, mainly to wait for external parse servers such as tika in
    docker setups). On success, updates ``doc.content``,
    ``doc.archive_filename`` and ``doc.archive_checksum`` and copies the
    parser's archive output into place; if the parser yields no archive
    file, or all retries fail, ``archive_checksum`` is cleared instead.
    The document is saved before returning in every outcome.
    """
    # Deferred import: parser modules may not be importable at migration
    # module load time.
    from documents.parsers import get_parser_class_for_mime_type, \
        DocumentParser, \
        ParseError

    logger.info(
        f"Regenerating archive document for document ID:{doc.id}")
    # Callers are expected to have verified a parser exists for this MIME
    # type beforehand — presumably in move_old_to_new_locations; if not,
    # parser_class may be None here. TODO confirm.
    parser_class = get_parser_class_for_mime_type(doc.mime_type)
    for try_num in range(retry_count):
        # A fresh parser instance per attempt; cleaned up in finally.
        parser: DocumentParser = parser_class(None, None)
        try:
            parse_wrapper(parser, source_path(doc), doc.mime_type,
                          os.path.basename(doc.filename))
            doc.content = parser.get_text()

            if parser.get_archive_path() and os.path.isfile(
                    parser.get_archive_path()):
                doc.archive_filename = generate_unique_filename(
                    doc, archive_filename=True)
                with open(parser.get_archive_path(), "rb") as f:
                    doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
                os.makedirs(os.path.dirname(archive_path_new(doc)),
                            exist_ok=True)
                shutil.copy2(parser.get_archive_path(),
                             archive_path_new(doc))
            else:
                # Parser produced no archive output; drop the stale checksum
                # so the document is treated as having no archive version.
                doc.archive_checksum = None
                logger.error(
                    f"Parser did not return an archive document for document "
                    f"ID:{doc.id}. Removing archive document.")
            doc.save()
            return
        except ParseError:

            if try_num + 1 == retry_count:
                # Final attempt failed: give up, clear the checksum and
                # leave regeneration to a manual management command.
                logger.exception(
                    f"Unable to regenerate archive document for ID:{doc.id}. You "
                    f"need to invoke the document_archiver management command "
                    f"manually for that document.")
                doc.archive_checksum = None
                doc.save()
                return
            else:
                # This is mostly here for the tika parser in docker
                # environments. The servers for parsing need to come up first,
                # and the docker setup doesn't ensure that tika is running
                # before attempting migrations.
                logger.error("Parse error, will try again in 5 seconds...")
                sleep(5)
        finally:
            # Always release parser temp files, success or failure.
            parser.cleanup()