def test_no_pageid_without_mets(self):
    """Getting or setting ``pageId`` on a METS-less OcrdFile must raise."""
    expected_error = ".*has no member 'mets' pointing.*"
    orphan_file = OcrdFile(None)

    # Reading the property is expected to raise before print() ever runs.
    with self.assertRaisesRegex(Exception, expected_error):
        print(orphan_file.pageId)

    # Writing the property is expected to fail with the same message.
    with self.assertRaisesRegex(Exception, expected_error):
        orphan_file.pageId = 'foo'
def test_loctype(self):
    """Check how the ``loctype`` and ``otherloctype`` properties interact."""
    ocrd_file = OcrdFile(None)

    # Values reported by a freshly constructed file.
    self.assertEqual(ocrd_file.loctype, 'OTHER')
    self.assertEqual(ocrd_file.otherloctype, 'FILE')

    # otherloctype can be overwritten.
    ocrd_file.otherloctype = 'foo'
    self.assertEqual(ocrd_file.otherloctype, 'foo')

    # After switching loctype to 'URN', otherloctype reads back as None.
    ocrd_file.loctype = 'URN'
    self.assertEqual(ocrd_file.loctype, 'URN')
    self.assertEqual(ocrd_file.otherloctype, None)

    # After setting otherloctype again, loctype reads back as 'OTHER'.
    ocrd_file.otherloctype = 'foo'
    self.assertEqual(ocrd_file.loctype, 'OTHER')
def test_ocrd_file_eq(self):
    """Exercise ``==`` / ``!=`` between OcrdFile instances."""
    mets = OcrdMets.empty_mets()

    # A file compares equal to itself.
    foo_first = mets.add_file('FOO', ID='FOO_1', mimetype='image/tiff')
    self.assertEqual(foo_first == foo_first, True)
    self.assertEqual(foo_first != foo_first, False)

    # A second file with a different ID in the same group is not equal.
    foo_second = mets.add_file('FOO', ID='FOO_2', mimetype='image/tiff')
    self.assertEqual(foo_first == foo_second, False)

    # Same ID, mimetypes 'image/tiff' vs. 'image/tif':
    # be tolerant of different equivalent mimetypes
    temp_tiff = OcrdFile(None, ID='TEMP_1', mimetype='image/tiff')
    temp_tif = OcrdFile(None, ID='TEMP_1', mimetype='image/tif')
    self.assertEqual(temp_tiff == temp_tif, True)

    # A METS-less file equals a file added to the METS with the same ID.
    temp_in_mets = mets.add_file('TEMP', ID='TEMP_1', mimetype='image/tiff')
    self.assertEqual(temp_tiff == temp_in_mets, True)
def test_file_group_wo_parent_new_version():
    """Constructing ``OcrdFile(None)`` must raise ValueError with the new message."""
    expected_message = r"Must provide mets:file element this OcrdFile represent"
    with pytest.raises(ValueError, match=expected_message):
        OcrdFile(None)
def _resolve_image_as_pil(self, image_url, coords=None):
    """
    Resolve an image URL to a PIL image.

    Args:
        image_url (string): URL of the image. It is looked up in the METS
            first; if no file matches, a temporary ``OcrdFile`` wrapping
            the URL is downloaded instead.
        coords (list): Coordinates of the bounding box to cut from the image.
            If ``None``, the whole image is returned.

    Returns:
        Image or region in image as PIL.Image
    """
    log = getLogger('ocrd.workspace._resolve_image_as_pil')
    files = self.mets.find_files(url=image_url)
    f = files[0] if files else OcrdFile(None, url=image_url)
    image_filename = self.download_file(f).local_filename

    with pushd_popd(self.directory):
        pil_image = Image.open(image_filename)
        pil_image.load()  # alloc and give up the FD

    if coords is None:
        return pil_image

    log.debug("Converting PIL to OpenCV: %s", image_url)
    color_conversion = cv2.COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else cv2.COLOR_RGB2BGR
    # NOTE(review): for mode '1' the cast presumably yields 0/1 values rather
    # than 0/255, which would make the cut nearly black -- confirm.
    pil_as_np_array = np.array(pil_image).astype('uint8') if pil_image.mode == '1' else np.array(pil_image)
    cv2_image = cv2.cvtColor(pil_as_np_array, color_conversion)

    # Cut the axis-aligned bounding box of the given points
    # (column 0 is used for x, column 1 for y).
    poly = np.array(coords, np.int32)
    log.debug("Cutting region %s from %s", coords, image_url)
    region_cut = cv2_image[
        np.min(poly[:, 1]):np.max(poly[:, 1]),
        np.min(poly[:, 0]):np.max(poly[:, 0])
    ]

    # cv2_image is in OpenCV's BGR channel order, but Image.fromarray reads a
    # 3-channel uint8 array as RGB. Reverse the channel axis so that colour
    # images do not come back with red and blue swapped (grayscale input has
    # three identical channels and is unaffected).
    return Image.fromarray(np.ascontiguousarray(region_cut[:, :, ::-1]))
def test_page_from_file(self):
    """Build a PAGE document from a TIFF file and check its ID and image width."""
    image_file = OcrdFile(
        None,
        mimetype='image/tiff',
        local_filename=SAMPLE_IMG,
        ID='file1',
    )
    self.assertEqual(image_file.mimetype, 'image/tiff')

    pcgts = page_from_file(image_file)
    # The generated document's pcGtsId is expected to equal the file ID.
    self.assertEqual(pcgts.pcGtsId, image_file.ID)
    self.assertEqual(pcgts.get_Page().imageWidth, 1457)
def download_url(self, url, **kwargs):
    """
    Download a URL to the workspace.

    The URL is wrapped in a METS-less ``OcrdFile`` and handed to
    ``self.download_file``.

    Args:
        url (string): URL to download to directory
        **kwargs : See :py:mod:`ocrd_models.ocrd_file.OcrdFile`

    Returns:
        The local filename of the downloaded file
    """
    downloaded = self.download_file(OcrdFile(None, url=url, **kwargs))
    return downloaded.local_filename
def resolve_image_exif(self, image_url):
    """
    Get the EXIF metadata about an image URL as :class:`OcrdExif`

    Args:
        image_url (string) : URL of image

    Return
        :class:`OcrdExif`
    """
    # Take the first METS match for the URL, falling back to a METS-less file.
    candidates = self.mets.find_files(url=image_url)
    fallback = OcrdFile(None, url=image_url)
    image_file = next(candidates, fallback)

    local_path = self.download_file(image_file).local_filename
    with Image.open(local_path) as pil_img:
        return OcrdExif(pil_img)
def resolve_image_exif(self, image_url):
    """
    Get the EXIF metadata about an image URL as :class:`OcrdExif`

    Results are memoized per URL in ``self.image_cache['exif']``. The file is
    passed to ``self.download_file`` on every call, whether or not the
    metadata is already cached.

    Args:
        image_url (string) : URL of image

    Return
        :class:`OcrdExif`
    """
    files = self.mets.find_files(url=image_url)
    f = files[0] if files else OcrdFile(None, url=image_url)
    image_filename = self.download_file(f).local_filename

    if image_url not in self.image_cache['exif']:
        # FIXME must be in the right directory
        # Open via a context manager so the underlying file handle is closed
        # once the metadata has been read, instead of leaking one descriptor
        # per cache miss.
        with Image.open(image_filename) as pil_img:
            self.image_cache['exif'][image_url] = OcrdExif(pil_img)
    return self.image_cache['exif'][image_url]
def test_page_from_file_unsupported_mimetype(self):
    """``page_from_file`` must raise ValueError for the mimetype 'foo/bar'."""
    with self.assertRaisesRegex(ValueError, "Unsupported mimetype"):
        # This very source file serves as an existing local file.
        bogus_file = OcrdFile(None, local_filename=__file__, mimetype='foo/bar')
        page_from_file(bogus_file)
def test_page_from_file_no_existe(self):
    """``page_from_file`` must raise FileNotFoundError for a missing local file."""
    expected_error = "File not found: 'no-existe'"
    with self.assertRaisesRegex(FileNotFoundError, expected_error):
        missing_file = OcrdFile(None, local_filename='no-existe', mimetype='foo/bar')
        page_from_file(missing_file)
def test_page_from_file_no_local_filename(self):
    """``page_from_file`` must reject a file that has no ``local_filename``."""
    expected_error = "input_file must have 'local_filename' property"
    with self.assertRaisesRegex(ValueError, expected_error):
        remote_only_file = OcrdFile(None, mimetype='image/tiff')
        page_from_file(remote_only_file)
def test_page_from_file_page(self):
    """Load an existing PAGE file and check the recorded image width."""
    page_file = OcrdFile(
        None,
        mimetype=MIMETYPE_PAGE,
        local_filename=SAMPLE_PAGE,
    )
    pcgts = page_from_file(page_file)
    self.assertEqual(pcgts.get_Page().imageWidth, 1457)
def test_fileGrp_wo_parent(self):
    """A file constructed with ``OcrdFile(None)`` reports file group 'TEMP'."""
    orphan_file = OcrdFile(None)
    self.assertEqual(orphan_file.fileGrp, 'TEMP')
def test_set_url(self):
    """Assign ``url`` several times; the last assigned value is read back."""
    ocrd_file = OcrdFile(None)
    # Assigning None first must not raise.
    ocrd_file.url = None
    ocrd_file.url = 'http://foo'
    ocrd_file.url = 'http://bar'
    self.assertEqual(ocrd_file.url, 'http://bar')
def test_file_group_wo_parent():
    """``OcrdFile(None)`` must raise a ValueError mentioning the missing METS."""
    with pytest.raises(ValueError) as excinfo:
        OcrdFile(None)

    message = str(excinfo.value)
    assert "not related to METS" in message
def test_extension(self):
    """``extension`` of a file at '/tmp/foo/bar/foo.bar' is '.bar'."""
    ocrd_file = OcrdFile(None, local_filename='/tmp/foo/bar/foo.bar')
    self.assertEqual(ocrd_file.extension, '.bar')
def test_basename(self):
    """``basename`` of a file at '/tmp/foo/bar/foo.bar' is 'foo.bar'."""
    ocrd_file = OcrdFile(None, local_filename='/tmp/foo/bar/foo.bar')
    self.assertEqual(ocrd_file.basename, 'foo.bar')
def test_set_id_none(self):
    """Assigning ``None`` to ``ID`` must leave the previous ID in place."""
    ocrd_file = OcrdFile(None)

    ocrd_file.ID = 'foo12'
    self.assertEqual(ocrd_file.ID, 'foo12')

    # The earlier value is expected to survive the None assignment.
    ocrd_file.ID = None
    self.assertEqual(ocrd_file.ID, 'foo12')
def test_constructor_url(self):
    """A ``url`` given to the constructor shows up as ``url`` and ``local_filename``."""
    ocrd_file = OcrdFile(None, url="foo/bar")
    self.assertEqual(ocrd_file.url, 'foo/bar')
    self.assertEqual(ocrd_file.local_filename, 'foo/bar')
def _resolve_image_as_pil(self, image_url, coords=None):
    """
    Resolve an image URL to a PIL image.

    Images whose PIL mode starts with ``I`` or ``F`` are re-quantized to
    8 bit before being returned or cut.

    Args:
        image_url (string): URL of the image. The first METS match is used;
            if there is none, a temporary ``OcrdFile`` wrapping the URL is
            downloaded instead.
        coords (list): Coordinates of the bounding box to cut from the image.
            If ``None``, the whole image is returned.

    Returns:
        Image or region in image as PIL.Image
    """
    log = getLogger('ocrd.workspace._resolve_image_as_pil')
    f = next(self.mets.find_files(url=image_url), OcrdFile(None, url=image_url))
    image_filename = self.download_file(f).local_filename

    with pushd_popd(self.directory):
        pil_image = Image.open(image_filename)
        pil_image.load()  # alloc and give up the FD

    # Pillow does not properly support higher color depths
    # (e.g. 16-bit or 32-bit or floating point grayscale),
    # clipping its dynamic range to the lower 8-bit in
    # many operations (including paste, putalpha, ImageStat...),
    # even including conversion.
    # Cf. Pillow#3011 Pillow#3159 Pillow#3838 (still open in 8.0)
    # So to be on the safe side, we must re-quantize these
    # to 8-bit via numpy (conversion to/from which fortunately
    # seems to work reliably):
    if (pil_image.mode.startswith('I') or
            pil_image.mode.startswith('F')):
        arr_image = np.array(pil_image)
        if arr_image.dtype.kind == 'i':
            # signed integer is *not* trustworthy in this context
            # (usually a mistake in the array interface)
            log.debug('Casting image "%s" from signed to unsigned', image_url)
            arr_image.dtype = np.dtype('u' + arr_image.dtype.name)
        if arr_image.dtype.kind == 'u':
            # integer needs to be scaled linearly to 8 bit
            # of course, an image might actually have some lower range
            # (e.g. 10-bit in I;16 or 20-bit in I or 4-bit in L),
            # but that would be guessing anyway, so here don't
            # make assumptions on _scale_, just reduce _precision_
            log.debug('Reducing image "%s" from depth %d bit to 8 bit',
                      image_url, arr_image.dtype.itemsize * 8)
            arr_image = arr_image >> 8 * (arr_image.dtype.itemsize - 1)
            arr_image = arr_image.astype(np.uint8)
        elif arr_image.dtype.kind == 'f':
            # float needs to be scaled from [0,1.0] to [0,255]
            log.debug('Reducing image "%s" from floating point to 8 bit', image_url)
            arr_image *= 255
            arr_image = arr_image.astype(np.uint8)
        pil_image = Image.fromarray(arr_image)

    if coords is None:
        return pil_image

    # FIXME: remove or replace this by (image_from_polygon+) crop_image ...
    log.debug("Converting PIL to OpenCV: %s", image_url)
    color_conversion = cv2.COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else cv2.COLOR_RGB2BGR
    # NOTE(review): for mode '1' the cast presumably yields 0/1 values rather
    # than 0/255, which would make the cut nearly black -- confirm.
    pil_as_np_array = np.array(pil_image).astype('uint8') if pil_image.mode == '1' else np.array(pil_image)
    cv2_image = cv2.cvtColor(pil_as_np_array, color_conversion)

    # Cut the axis-aligned bounding box of the given points
    # (column 0 is used for x, column 1 for y).
    poly = np.array(coords, np.int32)
    log.debug("Cutting region %s from %s", coords, image_url)
    region_cut = cv2_image[
        np.min(poly[:, 1]):np.max(poly[:, 1]),
        np.min(poly[:, 0]):np.max(poly[:, 0])
    ]

    # cv2_image is in OpenCV's BGR channel order, but Image.fromarray reads a
    # 3-channel uint8 array as RGB. Reverse the channel axis so that colour
    # images do not come back with red and blue swapped (grayscale input has
    # three identical channels and is unaffected).
    return Image.fromarray(np.ascontiguousarray(region_cut[:, :, ::-1]))
def test_basename_without_extension_tar(self):
    """``basename_without_extension`` of 'foo.tar.gz' is 'foo'."""
    tarball = OcrdFile(None, local_filename='/tmp/foo/bar/foo.tar.gz')
    self.assertEqual(tarball.basename_without_extension, 'foo')
def test_basename_from_url(self):
    """``basename`` of a file created from 'http://foo.bar/quux' is 'quux'."""
    remote_file = OcrdFile(None, url="http://foo.bar/quux")
    self.assertEqual(remote_file.basename, 'quux')
def test_page_from_file_unsupported_mimetype(self):
    """``page_from_file`` must raise "Unsupported mimetype" for 'foo/bar'."""
    with self.assertRaisesRegex(Exception, "Unsupported mimetype"):
        bogus_file = OcrdFile(None, mimetype='foo/bar')
        page_from_file(bogus_file)